diff --git a/.gitignore b/.gitignore
index f284581c..2401ffb3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ other_test/
 MACS3/IO/BedGraphIO.c
 MACS3/IO/Parser.c
 MACS3/IO/PeakIO.c
+MACS3/IO/BAM.c
 MACS3/Signal/BedGraph.c
 MACS3/Signal/CallPeakUnit.c
 MACS3/Signal/FixWidthTrack.c
@@ -28,6 +29,7 @@ MACS3/Signal/PairedEndTrack.c
 MACS3/Signal/PeakDetect.c
 MACS3/Signal/PeakModel.c
 MACS3/Signal/Pileup.c
+MACS3/Signal/PileupV2.c
 MACS3/Signal/Prob.c
 MACS3/Signal/RACollection.c
 MACS3/Signal/ReadAlignment.c
@@ -37,6 +39,8 @@ MACS3/Signal/Signal.c
 MACS3/Signal/SignalProcessing.c
 MACS3/Signal/UnitigRACollection.c
 MACS3/Signal/VariantStat.c
+MACS3/Signal/PeakVariants.c
+MACS3/Signal/PosReadsInfo.c
 
 # MacOSX temp
 .DS_Store
diff --git a/ChangeLog b/ChangeLog
index 3b8c97df..7d00be76 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -3,7 +3,16 @@
 	* Features added
 
-	1) We extensively rewrote the `pyx` codes into `py` codes. In
+	1) We implemented the IO module for reading the fragment files
+	usually used in single-cell ATAC-seq experiments:
+	`Parser.FragParser`. We also implemented a new
+	`PairedEndTrack.PETrackII` class to store the data in a fragment
+	file, including the barcode and count information. With the
+	`PETrackII` class, we are able to extract a subset using a list
+	of barcodes, which enables us to call peaks only on a pool
+	(pseudo-bulk) of cells.
+
+	2) We extensively rewrote the `pyx` codes into `py` codes. In
 	another words, we now apply the 'pure python style' with PEP-484
 	type annotations to our previous Cython style codes. So that, the
 	source codes can be more compatible to Python programming tools
diff --git a/MACS3/Commands/callvar_cmd.py b/MACS3/Commands/callvar_cmd.py
index 7f1a8097..cbd900fc 100644
--- a/MACS3/Commands/callvar_cmd.py
+++ b/MACS3/Commands/callvar_cmd.py
@@ -1,4 +1,4 @@
-# Time-stamp: <2024-10-02 16:34:23 Tao Liu>
+# Time-stamp: <2024-10-11 10:28:07 Tao Liu>
 
 """Description: Call variants directly
 
@@ -137,11 +137,11 @@ def run(args):
     peakio = open(peakbedfile)
     peaks = PeakIO()
-    i = 0
+    #i = 0
     for t_peak in peakio:
         fs = t_peak.rstrip().split()
-        i += 1
-        peaks.add(fs[0].encode(), int(fs[1]), int(fs[2]), name=b"%d" % i)
+        # i += 1
+        peaks.add(fs[0].encode(), int(fs[1]), int(fs[2]))  # , name=b"%d" % i)
     peaks.sort()
 
     # chrs = peaks.get_chr_names()
diff --git a/MACS3/Commands/refinepeak_cmd.py b/MACS3/Commands/refinepeak_cmd.py
index 47f7610a..ba9a4939 100644
--- a/MACS3/Commands/refinepeak_cmd.py
+++ b/MACS3/Commands/refinepeak_cmd.py
@@ -1,4 +1,4 @@
-# Time-stamp: <2024-10-02 17:01:42 Tao Liu>
+# Time-stamp: <2024-10-11 11:11:00 Tao Liu>
 
 """Description: refine peak summits
 
diff --git a/MACS3/IO/Parser.py b/MACS3/IO/Parser.py
index 09fd6f81..49954944 100644
--- a/MACS3/IO/Parser.py
+++ b/MACS3/IO/Parser.py
@@ -1,7 +1,7 @@
 # cython: language_level=3
 # cython: profile=True
 # cython: linetrace=True
-# Time-stamp: <2024-10-07 16:08:43 Tao Liu>
+# Time-stamp: <2024-10-22 10:25:23 Tao Liu>
 
 """Module for all MACS Parser classes for input. Please note that the
 parsers are for reading the alignment files ONLY.
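
An illustrative sketch of the pseudo-bulk workflow that feature 1) in the
ChangeLog describes. `FragParser` and `build_petrack2` appear later in this
patch; the constructor call and the barcode-subsetting method of `PETrackII`
are assumptions for illustration and are not shown in this diff:

    from MACS3.IO.Parser import FragParser

    # assumed: parser classes take the input path in __init__ and
    # transparently handle gzipped files
    parser = FragParser("fragments.tsv.gz")
    petrack = parser.build_petrack2()   # PETrackII with barcodes and counts

    # hypothetical method name: keep only fragments from selected cells,
    # then call peaks on this pseudo-bulk track
    pseudobulk = petrack.subset({b"AAACGAAAGACTCGGA-1", b"AAACGAATCGTC-1"})
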
@@ -28,7 +28,7 @@ from MACS3.Utilities.Constants import READ_BUFFER_SIZE from MACS3.Signal.FixWidthTrack import FWTrack -from MACS3.Signal.PairedEndTrack import PETrackI +from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII from MACS3.Utilities.Logger import logging logger = logging.getLogger(__name__) @@ -199,7 +199,8 @@ def __init__(self, string, strand): self.string = string def __str__(self): - return repr("Strand information can not be recognized in this line: \"%s\",\"%s\"" % (self.string, self.strand)) + return repr("Strand information can not be recognized in this line: \"%s\",\"%s\"" % + (self.string, self.strand)) @cython.cclass @@ -544,7 +545,8 @@ def pe_parse_line(self, thisline: bytes): atoi(thisfields[1]), atoi(thisfields[2])) except IndexError: - raise Exception("Less than 3 columns found at this line: %s\n" % thisline) + raise Exception("Less than 3 columns found at this line: %s\n" % + thisline) @cython.ccall def build_petrack(self): @@ -950,7 +952,9 @@ def tlen_parse_line(self, thisline: bytes) -> cython.int: thisfields = thisline.split(b'\t') bwflag = atoi(thisfields[1]) if bwflag & 4 or bwflag & 512 or bwflag & 256 or bwflag & 2048: - return 0 #unmapped sequence or bad sequence or 2nd or sup alignment + # unmapped sequence or bad sequence or 2nd or sup alignment + return 0 + if bwflag & 1: # paired read. We should only keep sequence if the mate is mapped # and if this is the left mate, all is within the flag! @@ -1068,9 +1072,11 @@ def __init__(self, filename: str, f.close() if self.gzipped: # open with gzip.open, then wrap it with BufferedReader! - self.fhd = io.BufferedReader(gzip.open(filename, mode='rb'), buffer_size=READ_BUFFER_SIZE) # buffersize set to 1M + self.fhd = io.BufferedReader(gzip.open(filename, mode='rb'), + buffer_size=READ_BUFFER_SIZE) else: - self.fhd = io.open(filename, mode='rb') # binary mode! I don't expect unicode here! + # binary mode! I don't expect unicode here! + self.fhd = io.open(filename, mode='rb') @cython.ccall def sniff(self): @@ -1089,7 +1095,8 @@ def sniff(self): return True else: self.fhd.seek(0) - raise Exception("File is not of a valid BAM format! %d" % tsize) + raise Exception("File is not of a valid BAM format! %d" % + tsize) else: self.fhd.seek(0) return False @@ -1189,7 +1196,8 @@ def build_fwtrack(self): rlengths: dict fwtrack = FWTrack(buffer_size=self.buffer_size) - references, rlengths = self.get_references() # after this, ptr at list of alignments + # after this, ptr at list of alignments + references, rlengths = self.get_references() # fseek = self.fhd.seek fread = self.fhd.read # ftell = self.fhd.tell @@ -1248,7 +1256,9 @@ def append_fwtrack(self, fwtrack): info("%d reads have been read." % i) self.fhd.close() # fwtrack.finalize() - # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files. + # this is the problematic part. If fwtrack is finalized, then + # it's impossible to increase the length of it in a step of + # buffer_size for multiple input files. fwtrack.set_rlengths(rlengths) return fwtrack @@ -1323,14 +1333,9 @@ def build_petrack(self): if i % 1000000 == 0: info(" %d fragments parsed" % i) - # print(f"{references[chrid]:},{fpos:},{tlen:}") info("%d fragments have been read." 
             % i)
-        # debug(f"  {e1} Can't identify the length of entry, it may be the end of file, stop looping...")
-        # debug(f"  {e2} Chromosome name can't be found which means this entry is skipped ...")
-        # assert i > 0, "Something went wrong, no fragment has been read! Check input file!"
         self.d = m / i
         self.n = i
-        # assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
         self.fhd.close()
         petrack.set_rlengths(rlengths)
         return petrack
@@ -1349,9 +1354,7 @@ def append_petrack(self, petrack):
         rlengths: dict
 
         references, rlengths = self.get_references()
-        # fseek = self.fhd.seek
         fread = self.fhd.read
-        # ftell = self.fhd.tell
 
         # for convenience, only count valid pairs
         add_loc = petrack.add_loc
@@ -1374,10 +1377,7 @@
         info("%d fragments have been read." % i)
         self.d = (self.d * self.n + m) / (self.n + i)
         self.n += i
-        # assert self.d >= 0, "Something went wrong (mean fragment size was negative: %d = %d / %d)" % (self.d, m, i)
         self.fhd.close()
-        # this is the problematic part. If fwtrack is finalized, then it's impossible to increase the length of it in a step of buffer_size for multiple input files.
-        # petrack.finalize()
         petrack.set_rlengths(rlengths)
         return petrack
@@ -1472,3 +1472,157 @@ def fw_parse_line(self, thisline: bytes) -> tuple:
                        1)
         else:
             raise StrandFormatError(thisline, thisfields[1])
+
+
+@cython.cclass
+class FragParser(GenericParser):
+    """Parser for Fragment file containing scATAC-seq information.
+
+    Format:
+
+    chromosome frag_leftend frag_rightend barcode count
+
+    Note: Only the first five columns are used!
+
+    """
+    n = cython.declare(cython.int, visibility='public')
+    d = cython.declare(cython.float, visibility='public')
+
+    @cython.cfunc
+    def skip_first_commentlines(self):
+        """FragParser needs to skip the first several comment lines.
+        """
+        l_line: cython.int
+        thisline: bytes
+
+        for thisline in self.fhd:
+            l_line = len(thisline)
+            if thisline and (thisline[:5] != b"track") \
+               and (thisline[:7] != b"browser") \
+               and (thisline[0] != 35):  # 35 is b"#"
+                break
+
+        # rewind from SEEK_CUR
+        self.fhd.seek(-l_line, 1)
+        return
+
+    @cython.cfunc
+    def pe_parse_line(self, thisline: bytes):
+        """Parse each line, and return chromosome, left and right
+        positions, barcode and count.
+
+        """
+        thisfields: list
+
+        thisline = thisline.rstrip()
+
+        # still only support tabular as delimiter.
+        thisfields = thisline.split(b'\t')
+        try:
+            return (thisfields[0],
+                    atoi(thisfields[1]),
+                    atoi(thisfields[2]),
+                    thisfields[3],
+                    atoi(thisfields[4]))
+        except IndexError:
+            raise Exception("Less than 5 columns found at this line: %s\n" %
+                            thisline)
+
+    @cython.ccall
+    def build_petrack2(self):
+        """Build PETrackII from all lines.
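+
+        Fragments are read in blocks of READ_BUFFER_SIZE; barcodes and
+        per-fragment counts are kept in the returned PETrackII object,
+        the mean fragment length is stored in self.d, and the total
+        number of fragments in self.n.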
+ + """ + chromosome: bytes + left_pos: cython.int + right_pos: cython.int + barcode: bytes + count: cython.uchar + i: cython.long = 0 # number of fragments + m: cython.long = 0 # sum of fragment lengths + tmp: bytes = b"" + + petrack = PETrackII(buffer_size=self.buffer_size) + add_loc = petrack.add_loc + + while True: + # for each block of input + tmp += self.fhd.read(READ_BUFFER_SIZE) + if not tmp: + break + lines = tmp.split(b"\n") + tmp = lines[-1] + for thisline in lines[:-1]: + (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline) + if left_pos < 0 or not chromosome: + continue + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + m += right_pos - left_pos + i += 1 + if i % 1000000 == 0: + info(" %d fragments parsed" % i) + add_loc(chromosome, left_pos, right_pos, barcode, count) + # last one + if tmp: + (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline) + if left_pos >= 0 and chromosome: + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + i += 1 + m += right_pos - left_pos + add_loc(chromosome, left_pos, right_pos, barcode, count) + + self.d = cython.cast(cython.float, m) / i + self.n = i + assert self.d >= 0, "Something went wrong (mean fragment size was negative)" + + self.close() + petrack.set_rlengths({"DUMMYCHROM": 0}) + return petrack + + @cython.ccall + def append_petrack(self, petrack): + """Build PETrackI from all lines, return a PETrackI object. + """ + chromosome: bytes + left_pos: cython.int + right_pos: cython.int + barcode: bytes + count: cython.uchar + i: cython.long = 0 # number of fragments + m: cython.long = 0 # sum of fragment lengths + tmp: bytes = b"" + + add_loc = petrack.add_loc + while True: + # for each block of input + tmp += self.fhd.read(READ_BUFFER_SIZE) + if not tmp: + break + lines = tmp.split(b"\n") + tmp = lines[-1] + for thisline in lines[:-1]: + (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline) + if left_pos < 0 or not chromosome: + continue + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + m += right_pos - left_pos + i += 1 + if i % 1000000 == 0: + info(" %d fragments parsed" % i) + add_loc(chromosome, left_pos, right_pos, barcode, count) + # last one + if tmp: + (chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline) + if left_pos >= 0 and chromosome: + assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline + i += 1 + m += right_pos - left_pos + add_loc(chromosome, left_pos, right_pos, barcode, count) + + self.d = (self.d * self.n + m) / (self.n + i) + self.n += i + + assert self.d >= 0, "Something went wrong (mean fragment size was negative)" + self.close() + petrack.set_rlengths({"DUMMYCHROM": 0}) + return petrack diff --git a/MACS3/IO/PeakIO.pyx b/MACS3/IO/PeakIO.py similarity index 58% rename from MACS3/IO/PeakIO.pyx rename to MACS3/IO/PeakIO.py index e959db25..0ad8f36c 100644 --- a/MACS3/IO/PeakIO.pyx +++ b/MACS3/IO/PeakIO.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-09-06 14:56:51 Tao Liu> +# Time-stamp: <2024-10-15 11:48:33 Tao Liu> """Module for PeakIO IO classes. 
@@ -15,32 +15,31 @@ from itertools import groupby from operator import itemgetter import random -import re import sys # ------------------------------------ # MACS3 modules # ------------------------------------ -from MACS3.Utilities.Constants import * +# from MACS3.Utilities.Constants import * # ------------------------------------ # Other modules # ------------------------------------ - -from cpython cimport bool +import cython +from cython.cimports.cpython import bool # ------------------------------------ # constants # ------------------------------------ -__version__ = "PeakIO $Revision$" -__author__ = "Tao Liu " -__doc__ = "PeakIO class" # ------------------------------------ # Misc functions # ------------------------------------ -cdef str subpeak_letters( int i): + + +@cython.cfunc +def subpeak_letters(i: cython.int) -> str: if i < 26: return chr(97+i) else: @@ -50,24 +49,32 @@ # Classes # ------------------------------------ -cdef class PeakContent: - cdef: - bytes chrom - int start - int end - int length - int summit - float score - float pileup - float pscore - float fc - float qscore - bytes name - - def __init__ ( self, bytes chrom, int start, int end, int summit, - float peak_score, float pileup, - float pscore, float fold_change, float qscore, - bytes name= b"NA" ): + +@cython.cclass +class PeakContent: + chrom: bytes + start: cython.int + end: cython.int + length: cython.int + summit: cython.int + score: cython.float + pileup: cython.float + pscore: cython.float + fc: cython.float + qscore: cython.float + name: bytes + + def __init__(self, + chrom: bytes, + start: cython.int, + end: cython.int, + summit: cython.int, + peak_score: cython.float, + pileup: cython.float, + pscore: cython.float, + fold_change: cython.float, + qscore: cython.float, + name: bytes = b""): self.chrom = chrom self.start = start self.end = end @@ -80,7 +87,7 @@ def __init__ ( self, bytes chrom, int start, int end, int summit, self.qscore = qscore self.name = name - def __getitem__ ( self, a ): + def __getitem__(self, a: str): if a == "chrom": return self.chrom elif a == "start": @@ -104,7 +111,7 @@ def __getitem__ ( self, a ): elif a == "name": return self.name - def __setitem__ ( self, a, v ): + def __setitem__(self, a: str, v): if a == "chrom": self.chrom = v elif a == "start": @@ -128,187 +135,222 @@ def __setitem__ ( self, a, v ): elif a == "name": self.name = v - def __str__ (self): - return "chrom:%s;start:%d;end:%d;score:%f" % ( self.chrom, self.start, self.end, self.score ) - -cdef class PeakIO: + def __str__(self): + return "chrom:%s;start:%d;end:%d;score:%f" % (self.chrom, + self.start, + self.end, + self.score) + + def __getstate__(self): + return (self.chrom, + self.start, + self.end, + self.length, + self.summit, + self.score, + self.pileup, + self.pscore, + self.fc, + self.qscore, + self.name) + + def __setstate__(self, state): + (self.chrom, self.start, self.end, self.length, self.summit, + self.score, self.pileup, self.pscore, self.fc, + self.qscore, self.name) = state + + +@cython.cclass +class PeakIO: """IO for peak information. 
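+
+    Internally, peaks are stored in a dict keyed by chromosome name
+    (bytes); each value is a list of PeakContent objects.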
""" - cdef: - public dict peaks # dictionary storing peak contents - public bool CO_sorted # whether peaks have been sorted by coordinations - public long total # total number of peaks - - def __init__ (self): + # dictionary storing peak contents + peaks = cython.declare(dict, visibility="public") + # whether peaks have been sorted by coordinations + CO_sorted = cython.declare(bool, visibility="public") + # total number of peaks + total = cython.declare(cython.long, visibility="public") + + def __init__(self): self.peaks = {} self.CO_sorted = False self.total = 0 - cpdef add (self, bytes chromosome, int start, int end, int summit = 0, - float peak_score = 0, float pileup = 0, - float pscore = 0, float fold_change = 0, float qscore = 0, - bytes name = b"NA"): - """items: - start:start - end:end, - length:end-start, - summit:summit, - score:peak_score, - pileup:pileup, - pscore:pscore, - fc:fold_change, - qscore:qscore - """ + @cython.ccall + def add(self, + chromosome: bytes, + start: cython.int, # leftmost position + end: cython.int, # rightmost position + summit: cython.int = 0, # summit position + peak_score: cython.float = 0, # score + pileup: cython.float = 0, # pileup value + pscore: cython.float = 0, # -log10 pvalue + fold_change: cython.float = 0, # fold change + qscore: cython.float = 0, # -log10 qvalue + name: bytes = b""): # peak name if not self.peaks.has_key(chromosome): - self.peaks[chromosome]=[] - self.peaks[chromosome].append(PeakContent( chromosome, start, end, summit, peak_score, pileup, pscore, fold_change, qscore, name)) + self.peaks[chromosome] = [] + self.peaks[chromosome].append(PeakContent(chromosome, + start, + end, + summit, + peak_score, + pileup, + pscore, + fold_change, + qscore, + name)) self.total += 1 self.CO_sorted = False - cpdef add_PeakContent ( self, bytes chromosome, object peakcontent ): + @cython.ccall + def add_PeakContent(self, + chromosome: bytes, + peakcontent: PeakContent): if not self.peaks.has_key(chromosome): - self.peaks[chromosome]=[] + self.peaks[chromosome] = [] self.peaks[chromosome].append(peakcontent) self.total += 1 self.CO_sorted = False - cpdef list get_data_from_chrom (self, bytes chrom): - if not self.peaks.has_key( chrom ): - self.peaks[chrom]= [] + @cython.ccall + def get_data_from_chrom(self, chrom: bytes) -> list: + if not self.peaks.has_key(chrom): + self.peaks[chrom] = [] return self.peaks[chrom] - cpdef set get_chr_names (self): - return set(sorted(self.peaks.keys())) + def get_chr_names(self) -> set: + return set(self.peaks.keys()) + + def sort(self): + chrs: list + chrom: bytes - def sort ( self ): - cdef: - list chrs - bytes chrom # sort by position if self.CO_sorted: # if already sorted, quit return chrs = sorted(list(self.peaks.keys())) for chrom in sorted(chrs): - self.peaks[chrom].sort(key=lambda x:x['start']) + self.peaks[chrom].sort(key=lambda x: x['start']) self.CO_sorted = True return - cpdef object randomly_pick ( self, int n, int seed = 12345 ): + @cython.ccall + def randomly_pick(self, n: cython.int, seed: cython.int = 12345): """Shuffle the peaks and get n peaks out of it. Return a new PeakIO object. 
""" - cdef: - list all_pc - list chrs - bytes chrom - object ret_peakio, p + all_pc: list + chrs: list + chrom: bytes + ret_peakio: PeakIO + p: PeakContent + assert n > 0 chrs = sorted(list(self.peaks.keys())) all_pc = [] for chrom in sorted(chrs): all_pc.extend(self.peaks[chrom]) - random.seed( seed ) - random.shuffle( all_pc ) + random.seed(seed) + random.shuffle(all_pc) all_pc = all_pc[:n] ret_peakio = PeakIO() for p in all_pc: - ret_peakio.add_PeakContent ( p["chrom"], p ) + ret_peakio.add_PeakContent(p["chrom"], p) return ret_peakio - - cpdef void filter_pscore (self, double pscore_cut ): - cdef: - bytes chrom - dict new_peaks - list chrs - object p + + @cython.ccall + def filter_pscore(self, pscore_cut: cython.double): + chrom: bytes + new_peaks: dict + chrs: list + new_peaks = {} chrs = sorted(list(self.peaks.keys())) self.total = 0 for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['pscore'] >= pscore_cut] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['pscore'] >= pscore_cut] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - cpdef void filter_qscore (self, double qscore_cut ): - cdef: - bytes chrom - dict new_peaks - list chrs - object p + @cython.ccall + def filter_qscore(self, qscore_cut: cython.double): + chrom: bytes + new_peaks: dict + chrs: list new_peaks = {} chrs = sorted(list(self.peaks.keys())) self.total = 0 for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['qscore'] >= qscore_cut] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['qscore'] >= qscore_cut] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - cpdef void filter_fc (self, float fc_low, float fc_up = 0 ): + @cython.ccall + def filter_fc(self, fc_low: cython.float, fc_up: cython.float = 0): """Filter peaks in a given fc range. If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up) """ - cdef: - bytes chrom - dict new_peaks - list chrs - object p + chrom: bytes + new_peaks: dict + chrs: list new_peaks = {} chrs = list(self.peaks.keys()) self.total = 0 if fc_up > 0 and fc_up > fc_low: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['fc'] >= fc_low and p['fc']= fc_low and p['fc'] < fc_up] + self.total += len(new_peaks[chrom]) else: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['fc'] >= fc_low] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['fc'] >= fc_low] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - cpdef void filter_score (self, float lower_score, float upper_score = 0 ): + def filter_score(self, lower_score: cython.float, upper_score: cython.float = 0): """Filter peaks in a given score range. 
""" - cdef: - bytes chrom - dict new_peaks - list chrs - object p + chrom: bytes + new_peaks: dict + chrs: list new_peaks = {} chrs = list(self.peaks.keys()) self.total = 0 if upper_score > 0 and upper_score > lower_score: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['score'] >= lower_score and p['score']= lower_score and p['score'] < upper_score] + self.total += len(new_peaks[chrom]) else: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in self.peaks[chrom] if p['score'] >= lower_score] - self.total += len( new_peaks[chrom] ) + new_peaks[chrom] = [p for p in self.peaks[chrom] if p['score'] >= lower_score] + self.total += len(new_peaks[chrom]) self.peaks = new_peaks self.CO_sorted = True self.sort() - def __str__ (self): + def __str__(self): """convert to text -- for debug """ - cdef: - list chrs - int n_peak - str ret + chrs: list + n_peak: cython.int + ret: str + chrom: bytes + peaks: list + ret = "" chrs = list(self.peaks.keys()) n_peak = 0 @@ -318,38 +360,44 @@ def __str__ (self): peaks = list(group) if len(peaks) > 1: for i, peak in enumerate(peaks): - ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d%s\tscore:%.6g\tsummit:%d\n" % (chrom.decode(),peak['start'],peak['end'],n_peak,subpeak_letters(i),peak["score"],peak["summit"]) + ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d%s\tscore:%.6g\tsummit:%d\n" % (chrom.decode(), peak['start'], peak['end'], n_peak, subpeak_letters(i), peak["score"], peak["summit"]) else: peak = peaks[0] - ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d\tscore:%.6g\tsummit:%d\n" % (chrom.decode(),peak['start'],peak['end'],n_peak,peak["score"],peak["summit"]) - + ret += "chrom:%s\tstart:%d\tend:%d\tname:peak_%d\tscore:%.6g\tsummit:%d\n" % (chrom.decode(), peak['start'], peak['end'], n_peak, peak["score"], peak["summit"]) return ret - cdef void _to_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS", - bytes description=b"%s", str score_column="score", - bool trackline=False, print_func=sys.stdout.write): + @cython.cfunc + def _to_bed(self, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = False, + print_func=sys.stdout.write): """ generalization of tobed and write_to_bed """ - cdef: - list chrs - int n_peak - bytes peakprefix, desc + chrs: list + n_peak: cython.int + peakprefix: bytes + desc: bytes + chrs = list(self.peaks.keys()) n_peak = 0 try: peakprefix = name_prefix % name - except: + except Exception: peakprefix = name_prefix try: desc = description % name - except: + except Exception: desc = description + if trackline: try: - print_func('track name="%s (peaks)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(), - desc.replace(b"\"", b"\\\"").decode() ) ) - except: + print_func('track name="%s (peaks)" description="%s" visibility=1\n' % (name.replace(b"\"", b"\\\"").decode(), + desc.replace(b"\"", b"\\\"").decode())) + except Exception: print_func('track name=MACS description=Unknown\n') for chrom in sorted(chrs): for end, group in groupby(self.peaks[chrom], key=itemgetter("end")): @@ -357,27 +405,43 @@ def __str__ (self): peaks = list(group) if len(peaks) > 1: for i, peak in enumerate(peaks): - print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column])) + print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(), peak['start'], peak['end'], peakprefix.decode(), n_peak, subpeak_letters(i), 
peak[score_column])) else: peak = peaks[0] - print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),peak['start'],peak['end'],peakprefix.decode(),n_peak,peak[score_column])) - - cdef _to_summits_bed(self, bytes name_prefix=b"%s_peak_", bytes name=b"MACS", - bytes description = b"%s", str score_column="score", - bool trackline=False, print_func=sys.stdout.write): + print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(), peak['start'], peak['end'], peakprefix.decode(), n_peak, peak[score_column])) + + @cython.cfunc + def _to_summits_bed(self, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = False, + print_func=sys.stdout.write): """ generalization of to_summits_bed and write_to_summit_bed """ + chrs: list + n_peak: cython.int + peakprefix: bytes + desc: bytes + chrs = list(self.peaks.keys()) n_peak = 0 - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix - try: desc = description % name - except: desc = description + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix + try: + desc = description % name + except Exception: + desc = description if trackline: - try: print_func('track name="%s (summits)" description="%s" visibility=1\n' % ( name.replace(b"\"", b"\\\"").decode(),\ - desc.replace(b"\"", b"\\\"").decode() ) ) - except: print_func('track name=MACS description=Unknown') + try: + print_func('track name="%s (summits)" description="%s" visibility=1\n' % (name.replace(b"\"", b"\\\"").decode(), + desc.replace(b"\"", b"\\\"").decode())) + except Exception: + print_func('track name=MACS description=Unknown') for chrom in sorted(chrs): for end, group in groupby(self.peaks[chrom], key=itemgetter("end")): n_peak += 1 @@ -385,14 +449,14 @@ def __str__ (self): if len(peaks) > 1: for i, peak in enumerate(peaks): summit_p = peak['summit'] - print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,subpeak_letters(i),peak[score_column])) + print_func("%s\t%d\t%d\t%s%d%s\t%.6g\n" % (chrom.decode(), summit_p, summit_p+1, peakprefix.decode(), n_peak, subpeak_letters(i), peak[score_column])) else: peak = peaks[0] summit_p = peak['summit'] - print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(),summit_p,summit_p+1,peakprefix.decode(),n_peak,peak[score_column])) + print_func("%s\t%d\t%d\t%s%d\t%.6g\n" % (chrom.decode(), summit_p, summit_p+1, peakprefix.decode(), n_peak, peak[score_column])) - def tobed (self): - """Print out peaks in BED5 format. + def tobed(self): + """Print out (stdout) peaks in BED5 format. Five columns are chromosome, peak start, peak end, peak name, and peak height. @@ -406,19 +470,23 @@ def tobed (self): fc:fold_change, qscore:qvalue """ - return self._to_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"") + return self._to_bed(name_prefix=b"%s_peak_", score_column="score", name=self.name, description=b"") - def to_summits_bed (self): - """Print out peak summits in BED5 format. + def to_summits_bed(self): + """Print out (stdout) peak summits in BED5 format. Five columns are chromosome, summit start, summit end, peak name, and peak height. 
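+
+        Example output line (illustrative values; columns are
+        tab-separated):
+
+            chr1    1234    1235    MACS_peak_1    12.5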
""" - return self._to_summits_bed(name_prefix=b"peak_", score_column="score", name=b"", description=b"") + return self._to_summits_bed(name_prefix=b"%s_peak_", score_column="score", name=self.name, description=b"") # these methods are very fast, specifying types is unnecessary - def write_to_bed (self, fhd, bytes name_prefix=b"peak_", bytes name=b"MACS", - bytes description = b"%s", str score_column="score", trackline=True): + def write_to_bed(self, fhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Write peaks in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting @@ -439,13 +507,20 @@ def write_to_bed (self, fhd, bytes name_prefix=b"peak_", bytes name=b"MACS", fc:fold_change, qscore:qvalue """ - #print(description) - return self._to_bed(name_prefix=name_prefix, name=name, - description=description, score_column=score_column, - print_func=fhd.write, trackline=trackline) - - def write_to_summit_bed (self, fhd, bytes name_prefix = b"peak_", bytes name = b"MACS", - bytes description = b"%s", str score_column ="score", trackline=True): + # print(description) + return self._to_bed(name_prefix=name_prefix, + name=name, + description=description, + score_column=score_column, + print_func=fhd.write, + trackline=trackline) + + def write_to_summit_bed(self, fhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = False): """Write peak summits in BED5 format in a file handler. Score (5th column) is decided by score_column setting. Check the following list. Name column ( 4th column) is made by putting @@ -469,7 +544,11 @@ def write_to_summit_bed (self, fhd, bytes name_prefix = b"peak_", bytes name = b description=description, score_column=score_column, print_func=fhd.write, trackline=trackline) - def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b"peak", str score_column="score", trackline=True): + def write_to_narrowPeak(self, fhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS", + score_column: str = "score", + trackline: bool = False): """Print out peaks in narrowPeak format. 
This format is designed for ENCODE project, and basically a @@ -523,33 +602,41 @@ def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b +-----------+------+----------------------------------------+ """ - cdef int n_peak - cdef bytes chrom - cdef long s - cdef str peakname + n_peak: cython.int + chrom: bytes + s: cython.long + peakname: str chrs = list(self.peaks.keys()) n_peak = 0 write = fhd.write - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix if trackline: write("track type=narrowPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode())) for chrom in sorted(chrs): for end, group in groupby(self.peaks[chrom], key=itemgetter("end")): n_peak += 1 these_peaks = list(group) - if len(these_peaks) > 1: # from call-summits + if len(these_peaks) > 1: # from call-summits for i, peak in enumerate(these_peaks): peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i)) if peak['summit'] == -1: s = -1 else: s = peak['summit'] - peak['start'] - fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" - % - (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]), - peak['fc'],peak['pscore'],peak['qscore'],s) ) + fhd.write("%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" % + (chrom.decode(), + peak['start'], + peak['end'], + peakname, + int(10*peak[score_column]), + peak['fc'], + peak['pscore'], + peak['qscore'], + s)) else: peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) @@ -557,13 +644,22 @@ def write_to_narrowPeak (self, fhd, bytes name_prefix = b"peak_", bytes name = b s = -1 else: s = peak['summit'] - peak['start'] - fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" - % - (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]), - peak['fc'],peak['pscore'],peak['qscore'],s) ) + fhd.write("%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\t%d\n" % + (chrom.decode(), + peak['start'], + peak['end'], + peakname, + int(10*peak[score_column]), + peak['fc'], + peak['pscore'], + peak['qscore'], + s)) return - def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MACS"): + @cython.ccall + def write_to_xls(self, ofhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS"): """Save the peak results in a tab-delimited plain text file with suffix .xls. @@ -571,11 +667,19 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA wait... why I have two write_to_xls in this class? 
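+
+        Columns written: chr, start (1-based), end, length, abs_summit,
+        pileup, -log10(pvalue), fold_enrichment, -log10(qvalue), name.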
""" + peakprefix: bytes + chrs: list + these_peaks: list + n_peak: cython.int + i: cython.int + write = ofhd.write - write("\t".join(("chr","start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") + write("\t".join(("chr", "start", "end", "length", "abs_summit", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix peaks = self.peaks chrs = list(peaks.keys()) @@ -587,47 +691,56 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA if len(these_peaks) > 1: for i, peak in enumerate(these_peaks): peakname = "%s%d%s" % (peakprefix.decode(), n_peak, subpeak_letters(i)) - #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] - write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length'])) - write("\t%d" % (peak['summit']+1)) # summit position - write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit - write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit - write("\t%.6g" % (peak['fc'])) # fold change at summit - write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit + # [start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] + write("%s\t%d\t%d\t%d" % (chrom.decode(), + peak['start']+1, + peak['end'], + peak['length'])) + write("\t%d" % (peak['summit']+1)) # summit position + write("\t%.6g" % (round(peak['pileup'], 2))) # pileup height at summit + write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit + write("\t%.6g" % (peak['fc'])) # fold change at summit + write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit write("\t%s" % peakname) write("\n") else: peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) - #[start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] - write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length'])) - write("\t%d" % (peak['summit']+1)) # summit position - write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit - write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit - write("\t%.6g" % (peak['fc'])) # fold change at summit - write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit + # [start,end,end-start,summit,peak_height,number_tags,pvalue,fold_change,qvalue] + write("%s\t%d\t%d\t%d" % (chrom.decode(), + peak['start']+1, + peak['end'], + peak['length'])) + write("\t%d" % (peak['summit']+1)) # summit position + write("\t%.6g" % (round(peak['pileup'], 2))) # pileup height at summit + write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit + write("\t%.6g" % (peak['fc'])) # fold change at summit + write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit write("\t%s" % peakname) write("\n") return - - cpdef void exclude (self, object peaksio2): + @cython.ccall + def exclude(self, peaksio2: object): """ Remove overlapping peaks in peaksio2, another PeakIO object. 
""" - cdef: - dict peaks1, peaks2 - list chrs1, chrs2 - bytes k - dict ret_peaks - bool overlap_found - object r1, r2 # PeakContent objects - long n_rl1, n_rl2 + peaks1: dict + peaks2: dict + chrs1: list + chrs2: list + k: bytes + ret_peaks: dict + overlap_found: bool + r1: PeakContent + r2: PeakContent + n_rl1: cython.long + n_rl2: cython.long self.sort() peaks1 = self.peaks self.total = 0 - assert isinstance(peaksio2,PeakIO) + assert isinstance(peaksio2, PeakIO) peaksio2.sort() peaks2 = peaksio2.peaks @@ -638,44 +751,44 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA #print(f"chromosome {k}") if not chrs2.count(k): # no such chromosome in peaks1, then don't touch the peaks in this chromosome - ret_peaks[ k ] = peaks1[ k ] + ret_peaks[k] = peaks1[k] continue - ret_peaks[ k ] = [] - n_rl1 = len( peaks1[k] ) - n_rl2 = len( peaks2[k] ) - rl1_k = iter( peaks1[k] ).__next__ - rl2_k = iter( peaks2[k] ).__next__ + ret_peaks[k] = [] + n_rl1 = len(peaks1[k]) + n_rl2 = len(peaks2[k]) + rl1_k = iter(peaks1[k]).__next__ + rl2_k = iter(peaks2[k]).__next__ overlap_found = False r1 = rl1_k() n_rl1 -= 1 r2 = rl2_k() n_rl2 -= 1 - while ( True ): + while (True): # we do this until there is no r1 or r2 left. if r2["start"] < r1["end"] and r1["start"] < r2["end"]: # since we found an overlap, r1 will be skipped/excluded # and move to the next r1 overlap_found = True - #print(f"found overlap of {r1['start']} {r1['end']} and {r2['start']} {r2['end']}, move to the next r1") + # print(f"found overlap of {r1['start']} {r1['end']} and {r2['start']} {r2['end']}, move to the next r1") n_rl1 -= 1 if n_rl1 >= 0: r1 = rl1_k() - #print(f"move to next r1 {r1['start']} {r1['end']}") + # print(f"move to next r1 {r1['start']} {r1['end']}") overlap_found = False continue else: break if r1["end"] < r2["end"]: - #print(f"now we need to move r1 {r1['start']} {r1['end']}") + # print(f"now we need to move r1 {r1['start']} {r1['end']}") # in this case, we need to move to the next r1, # we will check if overlap_found is true, if not, we put r1 in a new dict if not overlap_found: - #print(f"we add this r1 {r1['start']} {r1['end']} to list") - ret_peaks[ k ].append( r1 ) + # print(f"we add this r1 {r1['start']} {r1['end']} to list") + ret_peaks[k].append(r1) n_rl1 -= 1 if n_rl1 >= 0: r1 = rl1_k() - #print(f"move to next r1 {r1['start']} {r1['end']}") + # print(f"move to next r1 {r1['start']} {r1['end']}") overlap_found = False else: # no more r1 left @@ -685,54 +798,61 @@ def write_to_xls (self, ofhd, bytes name_prefix = b"%s_peak_", bytes name = b"MA if n_rl2: r2 = rl2_k() n_rl2 -= 1 - #print(f"move to next r2 {r2['start']} {r2['end']}") + # print(f"move to next r2 {r2['start']} {r2['end']}") else: # no more r2 left break # add the rest of r1 - #print( f"n_rl1: {n_rl1} n_rl2:{n_rl2} last overlap_found is {overlap_found}" ) - #if overlap_found: + # print( f"n_rl1: {n_rl1} n_rl2:{n_rl2} last overlap_found is {overlap_found}" ) + # if overlap_found: # n_rl1 -= 1 if n_rl1 >= 0: - ret_peaks[ k ].extend( peaks1[ k ][-n_rl1-1:] ) + ret_peaks[k].extend(peaks1[k][-n_rl1-1:]) for k in ret_peaks.keys(): - self.total += len( ret_peaks[ k ] ) + self.total += len(ret_peaks[k]) self.peaks = ret_peaks self.CO_sorted = True - self.sort() + self.sort() return - def read_from_xls (self, ofhd): + @cython.ccall + def read_from_xls(self, ofhd): """Save the peak results in a tab-delimited plain text file with suffix .xls. 
""" - cdef: - bytes line = b'' - bytes chrom = b'' - int n_peak = 0 - int start, end, length, summit - float pileup, pscore, fc, qscore - list fields + line: bytes = b'' + chrom: bytes = b'' + start: cython.int + end: cython.int + length: cython.int + summit: cython.int + pileup: cython.float + pscore: cython.float + fc: cython.float + qscore: cython.float + fields: list + while True: - if not (line.startswith('#') or line.strip() == ''): break + if not (line.startswith('#') or line.strip() == ''): + break line = ofhd.readline() # sanity check columns = line.rstrip().split('\t') - for a,b in zip(columns, ("chr","start", "end", "length", "abs_summit", - "pileup", "-log10(pvalue)", "fold_enrichment", - "-log10(qvalue)", "name")): - if not a==b: raise NotImplementedError('column %s not recognized', a) + for a, b in zip(columns, ("chr", "start", "end", "length", "abs_summit", + "pileup", "-log10(pvalue)", "fold_enrichment", + "-log10(qvalue)", "name")): + if not a == b: + raise NotImplementedError('column %s not recognized', a) add = self.add split = str.split rstrip = str.rstrip for i, line in enumerate(ofhd.readlines()): fields = split(line, '\t') - peak = {} chrom = fields[0].encode() start = int(fields[1]) - 1 end = int(fields[2]) @@ -748,68 +868,62 @@ def read_from_xls (self, ofhd): add(chrom, start, end, summit, qscore, pileup, pscore, fc, qscore, peakname) -cpdef parse_peakname(peakname): - """returns peaknumber, subpeak - """ - cdef: - bytes peak_id, peaknumber, subpeak - peak_id = peakname.split(b'_')[-1] - x = re.split('(\D.*)', peak_id) - peaknumber = int(x[0]) - try: - subpeak = x[1] - except IndexError: - subpeak = b'' - return (peaknumber, subpeak) - -cdef class RegionIO: + +@cython.cclass +class RegionIO: """For plain region of chrom, start and end """ - cdef: - dict regions - bool __flag_sorted + regions: dict + __flag_sorted: bool - def __init__ (self): - self.regions= {} + def __init__(self): + self.regions = {} self.__flag_sorted = False - cpdef void add_loc ( self, bytes chrom, int start, int end ): + @cython.ccall + def add_loc(self, chrom: bytes, start: cython.int, end: cython.int): if self.regions.has_key(chrom): - self.regions[chrom].append( (start,end) ) + self.regions[chrom].append((start, end)) else: - self.regions[chrom] = [(start,end), ] + self.regions[chrom] = [(start, end), ] self.__flag_sorted = False return - cpdef void sort (self): - cdef bytes chrom + @cython.ccall + def sort(self): + chrom: bytes for chrom in sorted(list(self.regions.keys())): self.regions[chrom].sort() self.__flag_sorted = True - cpdef set get_chr_names (self): + @cython.ccall + def get_chr_names(self) -> set: return set(sorted(self.regions.keys())) - cpdef void merge_overlap ( self ): + @cython.ccall + def merge_overlap(self): """ merge overlapping regions """ - cdef: - bytes chrom - int s_new_region, e_new_region, i, j - dict regions, new_regions - list chrs, regions_chr - tuple prev_region + chrom: bytes + s_new_region: cython.int + e_new_region: cython.int + i: cython.int + regions: dict + new_regions: dict + chrs: list + regions_chr: list + prev_region: tuple if not self.__flag_sorted: self.sort() regions = self.regions new_regions = {} - chrs = sorted( list( regions.keys() ) ) - for i in range( len( chrs ) ): + chrs = sorted(list(regions.keys())) + for i in range(len(chrs)): chrom = chrs[i] - new_regions[chrom]=[] + new_regions[chrom] = [] n_append = new_regions[chrom].append prev_region = None regions_chr = regions[chrom] @@ -821,7 +935,7 @@ def __init__ (self): if regions_chr[i][0] 
<= prev_region[1]: s_new_region = prev_region[0] e_new_region = regions_chr[i][1] - prev_region = (s_new_region,e_new_region) + prev_region = (s_new_region, e_new_region) else: n_append(prev_region) prev_region = regions_chr[i] @@ -831,43 +945,53 @@ def __init__ (self): self.sort() return - cpdef write_to_bed (self, fhd ): - cdef: - int i - bytes chrom - list chrs - tuple region + @cython.ccall + def write_to_bed(self, fhd): + i: cython.int + chrom: bytes + chrs: list + region: tuple chrs = sorted(list(self.regions.keys())) - for i in range( len(chrs) ): + for i in range(len(chrs)): chrom = chrs[i] for region in self.regions[chrom]: - fhd.write( "%s\t%d\t%d\n" % (chrom.decode(),region[0],region[1] ) ) - - -cdef class BroadPeakContent: - cdef: - long start - long end - long length - float score - bytes thickStart - bytes thickEnd - long blockNum - bytes blockSizes - bytes blockStarts - float pileup - float pscore - float fc - float qscore - bytes name - - def __init__ ( self, long start, long end, float score, - bytes thickStart, bytes thickEnd, - long blockNum, bytes blockSizes, - bytes blockStarts, float pileup, - float pscore, float fold_change, - float qscore, bytes name = b"NA" ): + fhd.write("%s\t%d\t%d\n" % (chrom.decode(), + region[0], + region[1])) + + +@cython.cclass +class BroadPeakContent: + start: cython.int + end: cython.int + length: cython.int + score: cython.float + thickStart: bytes + thickEnd: bytes + blockNum: cython.int + blockSizes: bytes + blockStarts: bytes + pileup: cython.float + pscore: cython.float + fc: cython.float + qscore: cython.float + name: bytes + + def __init__(self, + start: cython.int, + end: cython.int, + score: cython.float, + thickStart: bytes, + thickEnd: bytes, + blockNum: cython.int, + blockSizes: bytes, + blockStarts: bytes, + pileup: cython.float, + pscore: cython.float, + fold_change: cython.float, + qscore: cython.float, + name: bytes = b"NA"): self.start = start self.end = end self.score = score @@ -876,7 +1000,6 @@ def __init__ ( self, long start, long end, float score, self.blockNum = blockNum self.blockSizes = blockSizes self.blockStarts = blockStarts - self.length = end - start self.pileup = pileup self.pscore = pscore @@ -884,7 +1007,7 @@ def __init__ ( self, long start, long end, float score, self.qscore = qscore self.name = name - def __getitem__ ( self, a ): + def __getitem__(self, a): if a == "start": return self.start elif a == "end": @@ -914,26 +1037,36 @@ def __getitem__ ( self, a ): elif a == "name": return self.name - def __str__ (self): - return "start:%d;end:%d;score:%f" % ( self.start, self.end, self.score ) + def __str__(self): + return "start:%d;end:%d;score:%f" % (self.start, self.end, self.score) -cdef class BroadPeakIO: +@cython.cclass +class BroadPeakIO: """IO for broad peak information. 
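+
+    Broad peaks are stored in a dict keyed by chromosome name (bytes);
+    each value is a list of BroadPeakContent objects.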
""" - cdef: - dict peaks + peaks: dict - def __init__ (self): + def __init__(self): self.peaks = {} - def add (self, char * chromosome, long start, long end, long score = 0, - bytes thickStart=b".", bytes thickEnd=b".", - long blockNum=0, bytes blockSizes=b".", - bytes blockStarts=b".", float pileup = 0, - float pscore = 0, float fold_change = 0, - float qscore = 0, bytes name = b"NA" ): + @cython.ccall + def add(self, + chromosome: bytes, + start: cython.int, + end: cython.int, + score: cython.float = 0.0, + thickStart: bytes = b".", + thickEnd: bytes = b".", + blockNum: cython.int = 0, + blockSizes: bytes = b".", + blockStarts: bytes = b".", + pileup: cython.float = 0, + pscore: cython.float = 0, + fold_change: cython.float = 0, + qscore: cython.float = 0, + name: bytes = b"NA"): """items chromosome : chromosome name, start : broad region start, @@ -952,81 +1085,97 @@ def add (self, char * chromosome, long start, long end, long score = 0, """ if not self.peaks.has_key(chromosome): self.peaks[chromosome] = [] - self.peaks[chromosome].append( BroadPeakContent( start, end, score, thickStart, thickEnd, - blockNum, blockSizes, blockStarts, - pileup, pscore, fold_change, qscore, name ) ) - - def filter_pscore (self, double pscore_cut ): - cdef: - bytes chrom - dict peaks - dict new_peaks - list chrs - BroadPeakContent p + self.peaks[chromosome].append(BroadPeakContent(start, + end, + score, + thickStart, + thickEnd, + blockNum, + blockSizes, + blockStarts, + pileup, + pscore, + fold_change, + qscore, + name)) + + @cython.ccall + def filter_pscore(self, pscore_cut: cython.float): + chrom: bytes + peaks: dict + new_peaks: dict + chrs: list peaks = self.peaks new_peaks = {} chrs = list(peaks.keys()) for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['pscore'] >= pscore_cut] + new_peaks[chrom] = [p for p in peaks[chrom] if p['pscore'] >= pscore_cut] self.peaks = new_peaks - def filter_qscore (self, double qscore_cut ): - cdef: - bytes chrom - dict peaks - dict new_peaks - list chrs - BroadPeakContent p + @cython.ccall + def filter_qscore(self, qscore_cut: cython.float): + chrom: bytes + peaks: dict + new_peaks: dict + chrs: list peaks = self.peaks new_peaks = {} chrs = list(peaks.keys()) for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['qscore'] >= qscore_cut] + new_peaks[chrom] = [p for p in peaks[chrom] if p['qscore'] >= qscore_cut] self.peaks = new_peaks - def filter_fc (self, fc_low, fc_up=None ): + @cython.ccall + def filter_fc(self, fc_low: float, fc_up: float = -1): """Filter peaks in a given fc range. - If fc_low and fc_up is assigned, the peaks with fc in [fc_low,fc_up) + If fc_low and fc_up is assigned, the peaks with fc in + [fc_low,fc_up) + + fc_up has to be a positive number, otherwise it won't be + applied. 
""" - cdef: - bytes chrom - dict peaks - dict new_peaks - list chrs - BroadPeakContent p + chrom: bytes + peaks: dict + new_peaks: dict + chrs: list peaks = self.peaks new_peaks = {} chrs = list(peaks.keys()) - if fc_up: + if fc_up >= 0: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low and p['fc']= fc_low and p['fc'] < fc_up] else: for chrom in sorted(chrs): - new_peaks[chrom]=[p for p in peaks[chrom] if p['fc'] >= fc_low] + new_peaks[chrom] = [p for p in peaks[chrom] if p['fc'] >= fc_low] self.peaks = new_peaks - def total (self): - cdef: - bytes chrom - dict peaks - list chrs - long x + @cython.ccall + def total(self): + chrom: bytes + peaks: dict + chrs: list + x: cython.long = 0 peaks = self.peaks chrs = list(peaks.keys()) - x = 0 for chrom in sorted(chrs): x += len(peaks[chrom]) return x - def write_to_gappedPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True): + @cython.ccall + def write_to_gappedPeak(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b'peak', + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Print out peaks in gappedBed format. Only those with stronger enrichment regions are saved. This format is basically a BED12+3 format. @@ -1095,24 +1244,49 @@ def write_to_gappedPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'pea +--------------+------+----------------------------------------+ """ + chrs: list + n_peak: cython.int = 0 + peak: BroadPeakContent + desc: bytes + peakprefix: bytes + chrom: bytes + chrs = list(self.peaks.keys()) - n_peak = 0 - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix - try: desc = description % name - except: desc = description + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix + try: + desc = description % name + except Exception: + desc = description if trackline: - fhd.write("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name.decode(), desc.decode()) ) + fhd.write("track name=\"%s\" description=\"%s\" type=gappedPeak nextItemButton=on\n" % (name.decode(), desc.decode())) for chrom in sorted(chrs): for peak in self.peaks[chrom]: n_peak += 1 if peak["thickStart"] != b".": - fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t0\t0\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n" - % - (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]), - peak["blockNum"],peak["blockSizes"].decode(),peak["blockStarts"].decode(), peak['fc'], peak['pscore'], peak['qscore'] ) ) - - def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True): + fhd.write("%s\t%d\t%d\t%s%d\t%d\t.\t0\t0\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n" % + (chrom.decode(), + peak["start"], + peak["end"], + peakprefix.decode(), + n_peak, + int(10*peak[score_column]), + peak["blockNum"], + peak["blockSizes"].decode(), + peak["blockStarts"].decode(), + peak['fc'], + peak['pscore'], + peak['qscore'])) + + @cython.ccall + def write_to_Bed12(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b'peak', + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Print out peaks in Bed12 format. 
+--------------+------+----------------------------------------+ @@ -1167,31 +1341,58 @@ def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', b +--------------+------+----------------------------------------+ """ + chrs: list + n_peak: cython.int = 0 + peakprefix: bytes + peak: BroadPeakContent + desc: bytes + peakprefix: bytes + chrom: bytes + chrs = list(self.peaks.keys()) - n_peak = 0 - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix - try: desc = description % name - except: desc = description + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix + try: + desc = description % name + except Exception: + desc = description if trackline: - fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name.decode(), desc.decode()) ) + fhd.write("track name=\"%s\" description=\"%s\" type=bed nextItemButton=on\n" % (name.decode(), desc.decode())) for chrom in sorted(chrs): for peak in self.peaks[chrom]: n_peak += 1 if peak["thickStart"] == b".": # this will violate gappedPeak format, since it's a complement like broadPeak line. - fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\n" - % - (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]) ) ) + fhd.write("%s\t%d\t%d\t%s%d\t%d\t.\n" % + (chrom.decode(), + peak["start"], + peak["end"], + peakprefix.decode(), + n_peak, + int(10*peak[score_column]))) else: - fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n" - % - (chrom.decode(), peak["start"], peak["end"], peakprefix.decode(), n_peak, int(10*peak[score_column]), - peak["thickStart"].decode(), peak["thickEnd"].decode(), - peak["blockNum"], peak["blockSizes"].decode(), peak["blockStarts"].decode() )) - - - def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True): + fhd.write("%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\n" % + (chrom.decode(), + peak["start"], + peak["end"], + peakprefix.decode(), + n_peak, + int(10*peak[score_column]), + peak["thickStart"].decode(), + peak["thickEnd"].decode(), + peak["blockNum"], + peak["blockSizes"].decode(), + peak["blockStarts"].decode())) + + @cython.ccall + def write_to_broadPeak(self, fhd, + name_prefix: bytes = b"peak_", + name: bytes = b'peak', + description: bytes = b"%s", + score_column: str = "score", + trackline: bool = True): """Print out peaks in broadPeak format. 
This format is designed for ENCODE project, and basically a @@ -1241,16 +1442,20 @@ def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak +-----------+------+----------------------------------------+ """ - cdef int n_peak - cdef bytes chrom - cdef long s - cdef str peakname + chrs: list + n_peak: cython.int = 0 + peakprefix: bytes + peak: BroadPeakContent + peakprefix: bytes + chrom: bytes + peakname: str chrs = list(self.peaks.keys()) - n_peak = 0 write = fhd.write - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix if trackline: write("track type=broadPeak name=\"%s\" description=\"%s\" nextItemButton=on\n" % (name.decode(), name.decode())) for chrom in sorted(chrs): @@ -1259,13 +1464,21 @@ def write_to_broadPeak (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak these_peaks = list(group) peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) - fhd.write( "%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" % - (chrom.decode(),peak['start'],peak['end'],peakname,int(10*peak[score_column]), - peak['fc'],peak['pscore'],peak['qscore'] ) ) + fhd.write("%s\t%d\t%d\t%s\t%d\t.\t%.6g\t%.6g\t%.6g\n" % + (chrom.decode(), + peak['start'], + peak['end'], + peakname, + int(10*peak[score_column]), + peak['fc'], + peak['pscore'], + peak['qscore'])) return - - def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS"): + @cython.ccall + def write_to_xls(self, ofhd, + name_prefix: bytes = b"%s_peak_", + name: bytes = b"MACS"): """Save the peak results in a tab-delimited plain text file with suffix .xls. @@ -1273,11 +1486,21 @@ def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS") wait... why I have two write_to_xls in this class? 
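+
+        Columns written: chr, start (1-based), end, length, pileup,
+        -log10(pvalue), fold_enrichment, -log10(qvalue), name.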
""" + chrom: bytes + chrs: list + peakprefix: bytes + peaks: dict + these_peaks: list + peak: BroadPeakContent + peakname: str + write = ofhd.write - write("\t".join(("chr","start", "end", "length", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") + write("\t".join(("chr", "start", "end", "length", "pileup", "-log10(pvalue)", "fold_enrichment", "-log10(qvalue)", "name"))+"\n") - try: peakprefix = name_prefix % name - except: peakprefix = name_prefix + try: + peakprefix = name_prefix % name + except Exception: + peakprefix = name_prefix peaks = self.peaks chrs = list(peaks.keys()) @@ -1288,11 +1511,14 @@ def write_to_xls (self, ofhd, bytes name_prefix=b"%s_peak_", bytes name=b"MACS") these_peaks = list(group) peak = these_peaks[0] peakname = "%s%d" % (peakprefix.decode(), n_peak) - write("%s\t%d\t%d\t%d" % (chrom.decode(),peak['start']+1,peak['end'],peak['length'])) - write("\t%.6g" % (round(peak['pileup'],2))) # pileup height at summit - write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit - write("\t%.6g" % (peak['fc'])) # fold change at summit - write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit + write("%s\t%d\t%d\t%d" % (chrom.decode(), + peak['start']+1, + peak['end'], + peak['length'])) + write("\t%.6g" % (round(peak['pileup'], 2))) # pileup height at summit + write("\t%.6g" % (peak['pscore'])) # -log10pvalue at summit + write("\t%.6g" % (peak['fc'])) # fold change at summit + write("\t%.6g" % (peak['qscore'])) # -log10qvalue at summit write("\t%s" % peakname) write("\n") return diff --git a/MACS3/Signal/BedGraph.py b/MACS3/Signal/BedGraph.py new file mode 100644 index 00000000..baa46af4 --- /dev/null +++ b/MACS3/Signal/BedGraph.py @@ -0,0 +1,2844 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-15 16:18:23 Tao Liu> + +"""Module for BedGraph data class. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). 
+""" + +# ------------------------------------ +# python modules +# ------------------------------------ +import cython +from array import array as pyarray +from math import prod +# ------------------------------------ +# MACS3 modules +# ------------------------------------ +from MACS3.Signal.ScoreTrack import ScoreTrackII +from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, PeakContent +from MACS3.Signal.Prob import chisq_logp_e + +# ------------------------------------ +# Other modules +# ------------------------------------ + +from cython.cimports.cpython import bool +import numpy as np +import cython.cimports.numpy as cnp + +# ------------------------------------ +# C lib +# ------------------------------------ + +from cython.cimports.libc.math import sqrt, log10 + +# ------------------------------------ +# constants +# ------------------------------------ +LOG10_E = 0.43429448190325176 + +# ------------------------------------ +# Misc functions +# ------------------------------------ + + +@cython.inline +@cython.cfunc +def mean_func(x): + return sum(x)/len(x) + + +@cython.inline +@cython.cfunc +def fisher_func(x): + # combine -log10pvalues + return chisq_logp_e(2*sum(x)/LOG10_E, 2*len(x), log10=True) + + +@cython.inline +@cython.cfunc +def subtract_func(x): + # subtraction of two items list + return x[1] - x[0] + + +@cython.inline +@cython.cfunc +def divide_func(x): + # division of two items list + return x[1] / x[2] + + +@cython.inline +@cython.cfunc +def product_func(x): + # production of a list of values + # only python 3.8 or above + return prod(x) + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class bedGraphTrackI: + """Class for bedGraph type data. + + In bedGraph, data are represented as continuous non-overlapping + regions in the whole genome. I keep this assumption in all the + functions. If data has overlaps, some functions will definitely + give incorrect results. + + 1. Continuous: the next region should be after the previous one + unless they are on different chromosomes; + + 2. Non-overlapping: the next region should never have overlaps + with preceding region. + + The way to memorize bedGraph data is to remember the transition + points together with values of their preceding regions. The last + data point may exceed chromosome end, unless a chromosome + dictionary is given. Remember the coordinations in bedGraph and + this class is 0-indexed and right-open. + + """ + __data: dict + maxvalue = cython.declare(cython.float, visibility="public") + minvalue = cython.declare(cython.float, visibility="public") + baseline_value = cython.declare(cython.float, visibility="public") + + def __init__(self, baseline_value: cython.float = 0): + """ + baseline_value is the value to fill in the regions not defined + in bedGraph. For example, if the bedGraph is like: + + chr1 100 200 1 + chr1 250 350 2 + + Then the region chr1:200..250 should be filled with baseline_value. + + """ + self.__data = {} + self.maxvalue = -10000000 # initial maximum value is tiny since I want safe_add_loc to update it + self.minvalue = 10000000 # initial minimum value is large since I want safe_add_loc to update it + self.baseline_value = baseline_value + + @cython.ccall + def add_loc(self, chromosome: bytes, + startpos: cython.int, + endpos: cython.int, + value: cython.float): + """Add a chr-start-end-value block into __data dictionary. + + Note, we don't check if the add_loc is called continuously on + sorted regions without any gap. 
So we only suggest calling + this function within MACS. + + """ + pre_v: cython.float + + # basic assumption, end pos should > start pos + + if endpos <= 0: + return + if startpos < 0: + startpos = 0 + + if chromosome not in self.__data: + self.__data[chromosome] = [pyarray('i', []), + pyarray('f', [])] + c = self.__data[chromosome] + if startpos: + # start pos is not 0, then add two blocks, the first + # with "baseline_value"; the second with "value" + c[0].append(startpos) + c[1].append(self.baseline_value) + c[0].append(endpos) + c[1].append(value) + else: + c = self.__data[chromosome] + # get the preceding region + pre_v = c[1][-1] + + # if this region is next to the previous one. + if pre_v == value: + # if value is the same, simply extend it. + c[0][-1] = endpos + else: + # otherwise, add a new region + c[0].append(endpos) + c[1].append(value) + + if value > self.maxvalue: + self.maxvalue = value + if value < self.minvalue: + self.minvalue = value + + @cython.ccall + def add_loc_wo_merge(self, chromosome: bytes, + startpos: cython.int, + endpos: cython.int, + value: cython.float): + """Add a chr-start-end-value block into __data dictionary. + + Note, we don't check if the add_loc is called continuously on + sorted regions without any gap. So we only suggest calling + this function within MACS. + + This one won't merge nearby ranges with the same value + """ + if endpos <= 0: + return + if startpos < 0: + startpos = 0 + + if value < self.baseline_value: + value = self.baseline_value + + if chromosome not in self.__data: + self.__data[chromosome] = [pyarray('i', []), + pyarray('f', [])] + c = self.__data[chromosome] + if startpos: + # start pos is not 0, then add two blocks, the first + # with "baseline_value"; the second with "value" + c[0].append(startpos) + c[1].append(self.baseline_value) + c = self.__data[chromosome] + c[0].append(endpos) + c[1].append(value) + if value > self.maxvalue: + self.maxvalue = value + if value < self.minvalue: + self.minvalue = value + + @cython.ccall + def add_chrom_data(self, + chromosome: bytes, + p: pyarray, + v: pyarray): + """Add a pv data to a chromosome. Replace the previous data. + + p: a pyarray object 'i' for positions + v: a pyarray object 'f' for values + + Note: no checks for error, use with caution + """ + maxv: cython.float + minv: cython.float + + self.__data[chromosome] = [p, v] + maxv = max(v) + minv = min(v) + if maxv > self.maxvalue: + self.maxvalue = maxv + if minv < self.minvalue: + self.minvalue = minv + return + + @cython.ccall + def add_chrom_data_PV(self, + chromosome: bytes, + pv: cnp.ndarray): + """Add a pv data to a chromosome. Replace the previous data. + + This is a kinda silly function to waste time and convert a PV + array (2-d named numpy array) into two python arrays for this + BedGraph class. May have better function later. + + Note: no checks for error, use with caution + """ + maxv: cython.float + minv: cython.float + + self.__data[chromosome] = [pyarray('i', pv['p']), + pyarray('f', pv['v'])] + minv = pv['v'].min() + maxv = pv['v'].max() + if maxv > self.maxvalue: + self.maxvalue = maxv + if minv < self.minvalue: + self.minvalue = minv + return + + @cython.ccall + def destroy(self) -> bool: + """ destroy content, free memory. 
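A usage sketch may help illustrate the merging rule in `add_loc` above: the per-chromosome data are a pair of arrays, transition (end) positions and the values of the runs ending there, and an added block with the same value as the previous one simply extends the previous end position. This assumes the `MACS3.Signal.BedGraph` module added by this diff is importable; the coordinates and values are illustrative:

```python
# Illustrative usage of bedGraphTrackI.add_loc(); adjacent equal-valued
# blocks are merged by extending the last end position.
from MACS3.Signal.BedGraph import bedGraphTrackI

bdg = bedGraphTrackI(baseline_value=0)
bdg.add_loc(b"chr1", 0, 100, 1.0)
bdg.add_loc(b"chr1", 100, 200, 1.0)   # same value: previous block is extended
bdg.add_loc(b"chr1", 200, 300, 2.0)   # new value: a new block is appended
p, v = bdg.get_data_by_chr(b"chr1")
print(list(p), list(v))               # expected: [200, 300] [1.0, 2.0]
```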
+ """ + chrs: set + chrom: bytes + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + if chrom in self.__data: + self.__data[chrom] = [None, None] + self.__data.pop(chrom) + return True + + @cython.ccall + def get_data_by_chr(self, chromosome: bytes) -> list: + """Return array of counts by chromosome. + + The return value is a tuple: + ([end pos],[value]) + """ + if chromosome in self.__data: + return self.__data[chromosome] + else: + return [] + + @cython.ccall + def get_chr_names(self) -> set: + """Return all the chromosome names stored. + + """ + return set(sorted(self.__data.keys())) + + @cython.ccall + def reset_baseline(self, baseline_value: cython.float): + """Reset baseline value to baseline_value. + + So any region between self.baseline_value and baseline_value + will be set to baseline_value. + + """ + self.baseline_value = baseline_value + self.filter_score(cutoff=baseline_value) + self.merge_regions() + return + + @cython.cfunc + def merge_regions(self): + """Merge nearby regions with the same value. + + """ + # new_pre_pos: cython.int + pos: cython.int + i: cython.int + new_pre_value: cython.float + value: cython.float + chrom: bytes + chrs: set + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + (p, v) = self.__data[chrom] + pnext = iter(p).__next__ + vnext = iter(v).__next__ + + # new arrays + new_pos = pyarray('L', [pnext(),]) + new_value = pyarray('f', [vnext(),]) + + newpa = new_pos.append + newva = new_value.append + + # new_pre_pos = new_pos[0] + new_pre_value = new_value[0] + + for i in range(1, len(p)): + pos = pnext() + value = vnext() + if value == new_pre_value: + new_pos[-1] = pos + else: + # add new region + newpa(pos) + newva(value) + # new_pre_pos = pos + new_pre_value = value + self.__data[chrom] = [new_pos, new_value] + return True + + @cython.ccall + def filter_score(self, cutoff: cython.float = 0) -> bool: + """Filter using a score cutoff. Any region lower than score + cutoff will be set to self.baseline_value. + + Self will be modified. + """ + # new_pre_pos: cython.int + pos: cython.int + i: cython.int + new_pre_value: cython.float + value: cython.float + chrom: bytes + chrs: set + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + (p, v) = self.__data[chrom] + pnext = iter(p).__next__ + vnext = iter(v).__next__ + + # new arrays + new_pos = pyarray('L', []) + new_value = pyarray('f', []) + # new_pre_pos = 0 + new_pre_value = 0 + + for i in range(len(p)): + pos = pnext() + value = vnext() + + if value < cutoff: + # this region will be set to baseline_value + if new_pre_value == self.baseline_value: + # if preceding region is at baseline, extend it + new_pos[-1] = pos + else: + # else add a new baseline region + new_pos.append(pos) + new_value.append(self.baseline_value) + else: + # put it into new arrays + new_pos.append(pos) + new_value.append(value) + # new_pre_pos = new_pos[-1] + new_pre_value = new_value[-1] + self.__data[chrom] = [new_pos, new_value] + return True + + @cython.ccall + def summary(self) -> tuple: + """Calculate the sum, total_length, max, min, mean, and std. + + Return a tuple for (sum, total_length, max, min, mean, std). 
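The statistics that `summary` computes are length-weighted, since each stored value covers a run of positions. A small standalone worked example (numbers are made up) makes the formulas concrete:

```python
# Length-weighted mean and (n-1)-normalized std, as in summary() below.
import math

regions = [(100, 1.0), (50, 3.0)]            # (length, value) pairs
n = sum(ln for ln, _ in regions)             # total covered length: 150
s = sum(ln * v for ln, v in regions)         # weighted sum: 250.0
mean = s / n                                 # ~1.667
var = sum(ln * (v - mean) ** 2 for ln, v in regions) / (n - 1)
print(n, s, round(mean, 3), round(math.sqrt(var), 3))  # 150 250.0 1.667 0.946
```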
+ + """ + n_v: cython.long + sum_v: cython.float + max_v: cython.float + min_v: cython.float + mean_v: cython.float + variance: cython.float + tmp: cython.float + std_v: cython.float + pre_p: cython.int + ln: cython.int + i: cython.int + + pre_p = 0 + n_v = 0 + sum_v = 0 + max_v = -100000 + min_v = 100000 + for (p, v) in self.__data.values(): + # for each chromosome + pre_p = 0 + for i in range(len(p)): + # for each region + ln = p[i]-pre_p + sum_v += v[i]*ln + n_v += ln + pre_p = p[i] + max_v = max(max(v), max_v) + min_v = min(min(v), min_v) + mean_v = sum_v/n_v + variance = 0.0 + for (p, v) in self.__data.values(): + for i in range(len(p)): + # for each region + tmp = v[i]-mean_v + ln = p[i]-pre_p + variance += tmp*tmp*ln + pre_p = p[i] + + variance /= float(n_v-1) + std_v = sqrt(variance) + return (sum_v, n_v, max_v, min_v, mean_v, std_v) + + @cython.ccall + def call_peaks(self, + cutoff: cython.float = 1, + min_length: cython.int = 200, + max_gap: cython.int = 50, + call_summits: bool = False): + """This function try to find regions within which, scores + are continuously higher than a given cutoff. + + This function is NOT using sliding-windows. Instead, any + regions in bedGraph above certain cutoff will be detected, + then merged if the gap between nearby two regions are below + max_gap. After this, peak is reported if its length is above + min_length. + + cutoff: cutoff of value, default 1. + min_length : minimum peak length, default 200. + gap : maximum gap to merge nearby peaks, default 50. + + Removed option: + + up_limit: the highest acceptable value. Default 10^{310} + * so only allow peak with value >=cutoff and <=up_limit + + This does not work. The region above upper limit may still be + included as `gap` . + + """ + # peak_length: cython.int + x: cython.int + pre_p: cython.int + p: cython.int + i: cython.int + v: cython.float + chrom: bytes + chrs: set + + chrs = self.get_chr_names() + peaks = PeakIO() # dictionary to save peaks + for chrom in sorted(chrs): + peak_content = None + # peak_length = 0 + (ps, vs) = self.get_data_by_chr(chrom) # arrays for position and values + psn = iter(ps).__next__ # assign the next function to a viable to speed up + vsn = iter(vs).__next__ + x = 0 + pre_p = 0 # remember previous position + while True: + # find the first region above cutoff + try: # try to read the first data range for this chrom + p = psn() + v = vsn() + except Exception: + break + x += 1 # index for the next point + if v >= cutoff: + peak_content = [(pre_p, p, v),] + pre_p = p + break # found the first range above cutoff + else: + pre_p = p + + for i in range(x, len(ps)): + # continue scan the rest regions + p = psn() + v = vsn() + if v < cutoff: # not be detected as 'peak' + pre_p = p + continue + # for points above cutoff + # if the gap is allowed + if pre_p - peak_content[-1][1] <= max_gap: + peak_content.append((pre_p, p, v)) + else: + # when the gap is not allowed, close this peak + self.__close_peak(peak_content, + peaks, + min_length, + chrom) # , smoothlen=max_gap / 2) + # start a new peak + peak_content = [(pre_p, p, v),] + pre_p = p + + # save the last peak + if not peak_content: + continue + self.__close_peak(peak_content, + peaks, + min_length, + chrom) # , smoothlen=max_gap / 2) + return peaks + + @cython.cfunc + def __close_peak(self, + peak_content: list, + peaks, + min_length: cython.int, + chrom: bytes) -> bool: + tsummit: list # list for temporary summits + peak_length: cython.int + summit: cython.int + tstart: cython.int + tend: cython.int + 
summit_value: cython.float
+        tvalue: cython.float
+        peak_length = peak_content[-1][1]-peak_content[0][0]
+        if peak_length >= min_length:  # if the peak is too small, reject it
+            tsummit = []
+            summit = 0
+            summit_value = 0
+            for (tstart, tend, tvalue) in peak_content:
+                if not summit_value or summit_value < tvalue:
+                    tsummit = [cython.cast(cython.int, (tend+tstart)/2),]
+                    summit_value = tvalue
+                elif summit_value == tvalue:
+                    tsummit.append(cython.cast(cython.int, (tend+tstart)/2))
+            summit = tsummit[cython.cast(cython.int, (len(tsummit)+1)/2)-1]
+            peaks.add(chrom,
+                      peak_content[0][0],
+                      peak_content[-1][1],
+                      summit=summit,
+                      peak_score=summit_value,
+                      pileup=0,
+                      pscore=0,
+                      fold_change=0,
+                      qscore=0
+                      )
+        return True
+
+    @cython.ccall
+    def call_broadpeaks(self,
+                        lvl1_cutoff: cython.float = 500,
+                        lvl2_cutoff: cython.float = 100,
+                        min_length: cython.int = 200,
+                        lvl1_max_gap: cython.int = 50,
+                        lvl2_max_gap: cython.int = 400):
+        """This function tries to find enriched regions within which
+        scores are continuously higher than a given cutoff for level
+        1, then links them through regions above the level 2 cutoff,
+        with a maximum linkage length of lvl2_max_gap.
+
+        lvl1_cutoff: cutoff of value at enriched regions, default 500.
+        lvl2_cutoff: cutoff of value at linkage regions, default 100.
+        min_length : minimum peak length, default 200.
+        lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50.
+        lvl2_max_gap : maximum length of linkage regions, default 400.
+
+        Return both general PeakIO object for highly enriched regions
+        and gapped broad regions in BroadPeakIO.
+        """
+        chrom: bytes
+        i: cython.int
+        j: cython.int
+        chrs: set
+        lvl1: PeakContent
+        lvl2: PeakContent  # PeakContent class object
+        lvl1peakschrom: list
+        lvl2peakschrom: list
+
+        assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2."
+        assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1."
+ lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, + min_length=min_length, + max_gap=lvl1_max_gap, + call_summits=False) + lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, + min_length=min_length, + max_gap=lvl2_max_gap, + call_summits=False) + chrs = lvl1_peaks.get_chr_names() + broadpeaks = BroadPeakIO() + # use lvl2_peaks as linking regions between lvl1_peaks + for chrom in sorted(chrs): + lvl1peakschrom = lvl1_peaks.get_data_from_chrom(chrom) + lvl2peakschrom = lvl2_peaks.get_data_from_chrom(chrom) + lvl1peakschrom_next = iter(lvl1peakschrom).__next__ + tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region + # our assumption is lvl1 regions should be included in lvl2 regions + try: + lvl1 = lvl1peakschrom_next() + for i in range(len(lvl2peakschrom)): + # for each lvl2 peak, find all lvl1 peaks inside + lvl2 = lvl2peakschrom[i] + while True: + if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: + tmppeakset.append(lvl1) + lvl1 = lvl1peakschrom_next() + else: + self.__add_broadpeak(broadpeaks, + chrom, + lvl2, + tmppeakset) + tmppeakset = [] + break + except StopIteration: + self.__add_broadpeak(broadpeaks, chrom, lvl2, tmppeakset) + tmppeakset = [] + for j in range(i+1, len(lvl2peakschrom)): + self.__add_broadpeak(broadpeaks, + chrom, + lvl2peakschrom[j], + tmppeakset) + return broadpeaks + + @cython.cfunc + def __add_broadpeak(self, + bpeaks, + chrom: bytes, + lvl2peak: PeakContent, + lvl1peakset: list): + """Internal function to create broad peak. + """ + start: cython.int + end: cython.int + blockNum: cython.int + blockSizes: bytes + blockStarts: bytes + thickStart: bytes + thickEnd: bytes + + start = lvl2peak["start"] + end = lvl2peak["end"] + + # the following code will add those broad/lvl2 peaks with no + # strong/lvl1 peaks inside + if not lvl1peakset: + # try: + # will complement by adding 1bps start and end to this region + # may change in the future if gappedPeak format was improved. + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=(b"%d" % start), + thickEnd=(b"%d" % end), + blockNum=2, + blockSizes=b"1,1", + blockStarts=(b"0,%d" % (end-start-1)), + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + + thickStart = b"%d" % lvl1peakset[0]["start"] + thickEnd = b"%d" % lvl1peakset[-1]["end"] + blockNum = len(lvl1peakset) + blockSizes = b",".join([b"%d" % x["length"] for x in lvl1peakset]) + blockStarts = b",".join([b"%d" % (x["start"]-start) for x in lvl1peakset]) + + if int(thickStart) != start: + # add 1bp left block + thickStart = b"%d" % start + blockNum += 1 + blockSizes = b"1,"+blockSizes + blockStarts = b"0,"+blockStarts + if int(thickEnd) != end: + # add 1bp right block + thickEnd = b"%d" % end + blockNum += 1 + blockSizes = blockSizes+b",1" + blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) + + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=thickStart, + thickEnd=thickEnd, + blockNum=blockNum, + blockSizes=blockSizes, + blockStarts=blockStarts, + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + + @cython.ccall + def refine_peaks(self, peaks): + """This function try to based on given peaks, re-evaluate the + peak region, call the summit. 
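The containment assumption behind `call_broadpeaks` above (every level-1 peak is expected to fall inside some level-2 region) reduces to a simple interval check. A toy, standalone illustration with made-up intervals:

```python
# Group level-1 (strong) peaks under the level-2 (weak) region containing
# them; each level-2 region becomes one broad peak with the level-1 peaks
# inside it as blocks. Intervals are illustrative (start, end) tuples.
lvl1 = [(120, 180), (300, 380), (900, 950)]
lvl2 = [(100, 400), (850, 1000)]

for s2, e2 in lvl2:
    inside = [(s1, e1) for s1, e1 in lvl1 if s2 <= s1 and e1 <= e2]
    print((s2, e2), "->", inside)
# (100, 400) -> [(120, 180), (300, 380)]
# (850, 1000) -> [(900, 950)]
```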
+ + peaks: PeakIO object + return: a new PeakIO object + + """ + pre_p: cython.int + p: cython.int + peak_s: cython.int + peak_e: cython.int + v: cython.float + chrom: bytes + chrs: set + + peaks.sort() + new_peaks = PeakIO() + chrs = self.get_chr_names() + assert isinstance(peaks, PeakIO) + chrs = chrs.intersection(set(peaks.get_chr_names())) + + for chrom in sorted(chrs): + peaks_chr = peaks.get_data_from_chrom(chrom) + peak_content = [] + # arrays for position and values + (ps, vs) = self.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + psn = iter(ps).__next__ + vsn = iter(vs).__next__ + peakn = iter(peaks_chr).__next__ + + # remember previous position in bedgraph/self + pre_p = 0 + p = psn() + v = vsn() + peak = peakn() + peak_s = peak["start"] + peak_e = peak["end"] + while True: + # look for overlap + if p > peak_s and peak_e > pre_p: + # now put four coordinates together and pick the middle two + s, e = sorted([p, peak_s, peak_e, pre_p])[1:3] + # add this content + peak_content.append((s, e, v)) + # move self/bedGraph + try: + pre_p = p + p = psn() + v = vsn() + except Exception: + # no more value chunk in bedGraph + break + elif pre_p >= peak_e: + # close peak + self.__close_peak(peak_content, new_peaks, 0, chrom) + peak_content = [] + # move peak + try: + peak = peakn() + peak_s = peak["start"] + peak_e = peak["end"] + except Exception: + # no more peak + break + elif peak_s >= p: + # move self/bedgraph + try: + pre_p = p + p = psn() + v = vsn() + except Exception: + # no more value chunk in bedGraph + break + else: + raise Exception(f"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}") + + # save the last peak + if peak_content: + self.__close_peak(peak_content, new_peaks, 0, chrom) + return new_peaks + + @cython.ccall + def total(self) -> cython.int: + """Return the number of regions in this object. + + """ + t: cython.int + t = 0 + for (p, v) in self.__data.values(): + t += len(p) + return t + + @cython.ccall + def set_single_value(self, new_value: cython.float): + """Change all the values in bedGraph to the same new_value, + return a new bedGraphTrackI. + + """ + chrom: bytes + max_p: cython.int + + ret = bedGraphTrackI() + chroms = set(self.get_chr_names()) + for chrom in sorted(chroms): + # arrays for position and values + (p1, v1) = self.get_data_by_chr(chrom) + # maximum p + max_p = max(p1) + # add a region from 0 to max_p + ret.add_loc(chrom, 0, max_p, new_value) + return ret + + @cython.ccall + def overlie(self, bdgTracks, func: str = "max"): + """Calculate two or more bedGraphTrackI objects by letting self + overlying bdgTrack2, with user-defined functions. + + Transition positions from both bedGraphTrackI objects will be + considered and combined. For example: + + #1 bedGraph (self) | #2 bedGraph + ----------------------------------------------- + chr1 0 100 0 | chr1 0 150 1 + chr1 100 200 3 | chr1 150 250 2 + chr1 200 300 4 | chr1 250 300 4 + + these two bedGraphs will be combined to have five transition + points: 100, 150, 200, 250, and 300. 
So in order to calculate + two bedGraphs, I pair values within the following regions + like: + + chr s e (#1,#2) applied_func_max + ----------------------------------------------- + chr1 0 100 (0,1) 1 + chr1 100 150 (3,1) 3 + chr1 150 200 (3,2) 3 + chr1 200 250 (4,2) 4 + chr1 250 300 (4,4) 4 + + Then the given 'func' will be applied on each 2-tuple as func(#1,#2) + + Supported 'func' are "sum", "subtract" (only for two bdg + objects), "product", "divide" (only for two bdg objects), + "max", "mean" and "fisher". + + Return value is a new bedGraphTrackI object. + + Option: bdgTracks can be a list of bedGraphTrackI objects + """ + pre_p: cython.int + chrom: bytes + + nr_tracks = len(bdgTracks) + 1 # +1 for self + assert nr_tracks >= 2, "Specify at least one more bdg objects." + for i, bdgTrack in enumerate(bdgTracks): + assert isinstance(bdgTrack, bedGraphTrackI), "bdgTrack{} is not a bedGraphTrackI object".format(i + 1) + + if func == "max": + f = max + elif func == "mean": + f = mean_func + elif func == "fisher": + f = fisher_func + elif func == "sum": + f = sum + elif func == "product": + f = product_func + elif func == "subtract": + if nr_tracks == 2: + f = subtract_func + else: + raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") + elif func == "divide": + if nr_tracks == 2: + f = divide_func + else: + raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") + else: + raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. ") + + ret = bedGraphTrackI() + + common_chr = set(self.get_chr_names()) + for track in bdgTracks: + common_chr = common_chr.intersection(set(track.get_chr_names())) + + for chrom in sorted(common_chr): + datas = [self.get_data_by_chr(chrom)] + datas.extend([bdgTracks[i].get_data_by_chr(chrom) for i in range(len(bdgTracks))]) + + ps, vs, pn, vn = [], [], [], [] + for data in datas: + ps.append(data[0]) + pn.append(iter(ps[-1]).__next__) + vs.append(data[1]) + vn.append(iter(vs[-1]).__next__) + + pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret + try: + ps_cur = [pn[i]() for i in range(len(pn))] + vs_cur = [vn[i]() for i in range(len(pn))] + + while True: + # get the lowest position + lowest_p = min(ps_cur) + + # at least one lowest position, could be multiple + locations = [i for i in range(len(ps_cur)) if ps_cur[i] == lowest_p] + + # add the data until the interval + ret.add_loc(chrom, pre_p, ps_cur[locations[0]], f(vs_cur)) + + pre_p = ps_cur[locations[0]] + for index in locations: + ps_cur[index] = pn[index]() + vs_cur[index] = vn[index]() + except StopIteration: + # meet the end of either bedGraphTrackI, simply exit + pass + return ret + + @cython.ccall + def apply_func(self, func) -> bool: + """Apply function 'func' to every value in this bedGraphTrackI object. + + *Two adjacent regions with same value after applying func will + not be merged. + """ + i: cython.int + + for (p, s) in self.__data.values(): + for i in range(len(s)): + s[i] = func(s[i]) + self.maxvalue = func(self.maxvalue) + self.minvalue = func(self.minvalue) + return True + + @cython.ccall + def p2q(self): + """Convert pvalue scores to qvalue scores. + + *Assume scores in this bedGraph are pvalue scores! Not work + for other type of scores. 
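The p-score to q-score conversion that `p2q` below implements is a Benjamini-Hochberg-style correction carried out on -log10 scores. A standalone numeric sketch of the same loop (illustrative inputs; note the rank `k` stays at 1, as in the loop as written):

```python
# Sketch of p2q(): with f = -log10(N), q = v + log10(k) + f, clamped so the
# resulting q-scores are monotonically non-increasing and never negative.
from math import log10

pvalue_stat = {5.0: 100, 3.0: 400, 1.0: 500}   # -log10(p) -> covered length
N = sum(pvalue_stat.values())                   # total length: 1000
f = -log10(N)
k = 1
pre_q = 2147483647
pqtable = {}
for v in sorted(pvalue_stat, reverse=True):
    q = v + (log10(k) + f)
    q = max(0, min(pre_q, q))
    pqtable[v] = q
    pre_q = q
print(pqtable)   # {5.0: 2.0, 3.0: 0.0, 1.0: 0}
```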
+ """ + chrom: bytes + pos_array: pyarray + pscore_array: pyarray + pvalue_stat: dict = {} + pqtable: dict = {} + pre_p: cython.long + this_p: cython.long + # pre_l: cython.long + # l: cython.long + i: cython.long + nhcal: cython.long = 0 + N: cython.long + k: cython.long + this_l: cython.long + this_v: cython.float + # pre_v: cython.float + v: cython.float + q: cython.float + pre_q: cython.float + f: cython.float + unique_values: list + + # calculate frequencies of each p-score + for chrom in sorted(self.get_chr_names()): + pre_p = 0 + + [pos_array, pscore_array] = self.__data[chrom] + + pn = iter(pos_array).__next__ + vn = iter(pscore_array).__next__ + + for i in range(len(pos_array)): + this_p = pn() + this_v = vn() + this_l = this_p - pre_p + if this_v in pvalue_stat: + pvalue_stat[this_v] += this_l + else: + pvalue_stat[this_v] = this_l + pre_p = this_p + + # nhcal += len(pos_array) + + # nhval = 0 + + N = sum(pvalue_stat.values()) # total length + k = 1 # rank + f = -log10(N) + # pre_v = -2147483647 + # pre_l = 0 + pre_q = 2147483647 # save the previous q-value + + # calculate qscore for each pscore + pqtable = {} + unique_values = sorted(pvalue_stat.keys(), reverse=True) + for i in range(len(unique_values)): + v = unique_values[i] + # l = pvalue_stat[v] + q = v + (log10(k) + f) + q = max(0, min(pre_q, q)) # make q-score monotonic + pqtable[v] = q + # pre_v = v + pre_q = q + # k += l + nhcal += 1 + + # convert pscore to qscore + for chrom in sorted(self.get_chr_names()): + [pos_array, pscore_array] = self.__data[chrom] + + for i in range(len(pos_array)): + pscore_array[i] = pqtable[pscore_array[i]] + + self.merge_regions() + return + + @cython.ccall + def extract_value(self, bdgTrack2): + """Extract values from regions defined in bedGraphTrackI class object + `bdgTrack2`. + + """ + pre_p: cython.int + p1: cython.int + p2: cython.int + i: cython.int + v1: cython.float + v2: cython.float + chrom: bytes + + assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object" + + # 1: region in bdgTrack2; 2: value; 3: length with the value + ret = [[], pyarray('f', []), pyarray('L', [])] + radd = ret[0].append + vadd = ret[1].append + ladd = ret[2].append + + chr1 = set(self.get_chr_names()) + chr2 = set(bdgTrack2.get_chr_names()) + common_chr = chr1.intersection(chr2) + for i in range(len(common_chr)): + chrom = common_chr.pop() + (p1s, v1s) = self.get_data_by_chr(chrom) # arrays for position and values + # assign the next function to a viable to speed up + p1n = iter(p1s).__next__ + v1n = iter(v1s).__next__ + + # arrays for position and values + (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + p2n = iter(p2s).__next__ + v2n = iter(v2s).__next__ + # remember the previous position in the new bedGraphTrackI + # object ret + pre_p = 0 + try: + p1 = p1n() + v1 = v1n() + + p2 = p2n() + v2 = v2n() + + while True: + if p1 < p2: + # clip a region from pre_p to p1, then set pre_p as p1. + if v2 > 0: + radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + vadd(v1) + ladd(p1-pre_p) + pre_p = p1 + # call for the next p1 and v1 + p1 = p1n() + v1 = v1n() + elif p2 < p1: + # clip a region from pre_p to p2, then set + # pre_p as p2. + if v2 > 0: + radd(str(chrom)+"."+str(pre_p)+"."+str(p2)) + vadd(v1) + ladd(p2-pre_p) + pre_p = p2 + # call for the next p2 and v2 + p2 = p2n() + v2 = v2n() + elif p1 == p2: + # from pre_p to p1 or p2, then set pre_p as p1 or p2. 
+ if v2 > 0: + radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + vadd(v1) + ladd(p1-pre_p) + pre_p = p1 + # call for the next p1, v1, p2, v2. + p1 = p1n() + v1 = v1n() + p2 = p2n() + v2 = v2n() + except StopIteration: + # meet the end of either bedGraphTrackI, simply exit + pass + + return ret + + @cython.ccall + def extract_value_hmmr(self, bdgTrack2): + """Extract values from regions defined in bedGraphTrackI class object + `bdgTrack2`. + + I will try to tweak this function to output only the values of + bdgTrack1 (self) in the regions in bdgTrack2 + + This is specifically for HMMRATAC. bdgTrack2 should be a + bedgraph object containing the bins with value set to + 'mark_bin' -- the bins in the same region will have the same + value. + """ + # pre_p: cython.int + p1: cython.int + p2: cython.int + i: cython.int + v1: cython.float + v2: cython.float + chrom: bytes + ret: list + + assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object" + + # 0: bin location (chrom, position); 1: value; 2: number of bins in this region + ret = [[], pyarray('f', []), pyarray('i', [])] + padd = ret[0].append + vadd = ret[1].append + ladd = ret[2].append + + chr1 = set(self.get_chr_names()) + chr2 = set(bdgTrack2.get_chr_names()) + common_chr = sorted(list(chr1.intersection(chr2))) + for i in range(len(common_chr)): + chrom = common_chr.pop() + # arrays for position and values + (p1s, v1s) = self.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + p1n = iter(p1s).__next__ + v1n = iter(v1s).__next__ + + # arrays for position and values + (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + p2n = iter(p2s).__next__ + v2n = iter(v2s).__next__ + # remember the previous position in the new bedGraphTrackI + # object ret + # pre_p = 0 + try: + p1 = p1n() + v1 = v1n() + + p2 = p2n() + v2 = v2n() + + while True: + if p1 < p2: + # clip a region from pre_p to p1, then set pre_p as p1. + # in this case, we don't output any + # if v2>0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # vadd(v1) + # ladd(p1-pre_p) + # pre_p = p1 + # call for the next p1 and v1 + p1 = p1n() + v1 = v1n() + elif p2 < p1: + # clip a region from pre_p to p2, then set pre_p as p2. + if v2 != 0: # 0 means it's a gap region, we should have value > 1 + padd((chrom, p2)) + vadd(v1) + ladd(int(v2)) + # pre_p = p2 + # call for the next p2 and v2 + p2 = p2n() + v2 = v2n() + elif p1 == p2: + # from pre_p to p1 or p2, then set pre_p as p1 or p2. + if v2 != 0: # 0 means it's a gap region, we should have 1 or -1 + padd((chrom, p2)) + vadd(v1) + ladd(int(v2)) + # pre_p = p1 + # call for the next p1, v1, p2, v2. + p1 = p1n() + v1 = v1n() + p2 = p2n() + v2 = v2n() + except StopIteration: + # meet the end of either bedGraphTrackI, simply exit + pass + + return ret + + @cython.ccall + def make_ScoreTrackII_for_macs(self, bdgTrack2, + depth1: float = 1.0, + depth2: float = 1.0): + """A modified overlie function for MACS v2. + + effective_depth_in_million: sequencing depth in million after + duplicates being filtered. If + treatment is scaled down to + control sample size, then this + should be control sample size in + million. And vice versa. + + Return value is a ScoreTrackII object. 
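Both `extract_value` above and `make_ScoreTrackII_for_macs` rest on the same two-pointer walk over paired position arrays: advance whichever track has the nearer end position and emit one segment per transition point. Reduced to its core, on standalone illustrative data:

```python
# Two-pointer merge over two (end-position, value) tracks; prints one
# segment per transition point, with both tracks' values for that segment.
p1, v1 = [100, 200, 300], [0.0, 3.0, 4.0]   # track 1
p2, v2 = [150, 250, 300], [1.0, 2.0, 4.0]   # track 2

i = j = pre = 0
while i < len(p1) and j < len(p2):
    end = min(p1[i], p2[j])
    print(pre, end, (v1[i], v2[j]))
    pre = end
    if p1[i] == end:
        i += 1
    if p2[j] == end:
        j += 1
# 0 100 (0.0, 1.0) / 100 150 (3.0, 1.0) / 150 200 (3.0, 2.0)
# 200 250 (4.0, 2.0) / 250 300 (4.0, 4.0)
```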
+ """ + # pre_p: cython.int + p1: cython.int + p2: cython.int + v1: cython.float + v2: cython.float + chrom: bytes + + assert isinstance(bdgTrack2, bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" + + ret = ScoreTrackII(treat_depth=depth1, + ctrl_depth=depth2) + retadd = ret.add + + chr1 = set(self.get_chr_names()) + chr2 = set(bdgTrack2.get_chr_names()) + common_chr = chr1.intersection(chr2) + for chrom in sorted(common_chr): + # arrays for position and values + (p1s, v1s) = self.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + p1n = iter(p1s).__next__ + v1n = iter(v1s).__next__ + # arrays for position and values + (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + p2n = iter(p2s).__next__ + v2n = iter(v2s).__next__ + + # this is the maximum number of locations needed to be + # recorded in scoreTrackI for this chromosome. + chrom_max_len = len(p1s)+len(p2s) + + ret.add_chromosome(chrom, chrom_max_len) + + # remember the previous position in the new bedGraphTrackI + # object ret + # pre_p = 0 + + try: + p1 = p1n() + v1 = v1n() + + p2 = p2n() + v2 = v2n() + + while True: + if p1 < p2: + # clip a region from pre_p to p1, then set pre_p as p1. + retadd(chrom, p1, v1, v2) + # pre_p = p1 + # call for the next p1 and v1 + p1 = p1n() + v1 = v1n() + elif p2 < p1: + # clip a region from pre_p to p2, then set pre_p as p2. + retadd(chrom, p2, v1, v2) + # pre_p = p2 + # call for the next p2 and v2 + p2 = p2n() + v2 = v2n() + elif p1 == p2: + # from pre_p to p1 or p2, then set pre_p as p1 or p2. + retadd(chrom, p1, v1, v2) + # pre_p = p1 + # call for the next p1, v1, p2, v2. + p1 = p1n() + v1 = v1n() + p2 = p2n() + v2 = v2n() + except StopIteration: + # meet the end of either bedGraphTrackI, simply exit + pass + + ret.finalize() + # ret.merge_regions() + return ret + + @cython.ccall + def cutoff_analysis(self, + max_gap: cython.int, + min_length: cython.int, + steps: cython.int = 100, + min_score: cython.float = 0, + max_score: cython.float = 1000) -> str: + """ + Cutoff analysis function for bedGraphTrackI object. + + This function will try all possible cutoff values on the score + column to call peaks. Then will give a report of a number of + metrics (number of peaks, total length of peaks, average + length of peak) at varying score cutoffs. For each score + cutoff, the function finds the positions where the score + exceeds the cutoff, then groups those positions into "peaks" + based on the maximum allowed gap (max_gap) between consecutive + positions. If a peak's length exceeds the minimum length + (min_length), the peak is counted. + + Parameters + ---------- + + max_gap : int32_t + Maximum allowed gap between consecutive positions above cutoff + + min_length : int32_t Minimum length of peak + steps: int32_t + It will be used to calculate 'step' to increase from min_v to + max_v (see below). + + min_score: float32_t + Minimum score for cutoff analysis. Note1: we will take the + larger value between the actual minimum value in the BedGraph + and min_score as min_v. Note2: the min_v won't be included in + the final result. We will try to output the smallest cutoff as + min_v+step. + + max_score: float32_t + Maximum score for cutoff analysis. Note1: we will take the + smaller value between the actual maximum value in the BedGraph + and max_score as max_v. Note2: the max_v may not be included + in the final result. We will only output the cutoff that can + generate at least 1 peak. 
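How the grid of candidate cutoffs is derived can be shown in a few lines. A standalone sketch (the observed min/max values are made up; the stepping and rounding follow `cutoff_analysis`):

```python
# Build the list of cutoffs tried by cutoff_analysis(): clamp the observed
# value range by min_score/max_score, then take `steps` evenly spaced values.
import numpy as np

min_score, max_score = 0.0, 1000.0
minv = max(min_score, 0.5)    # suppose the track's observed minimum is 0.5
maxv = min(80.0, max_score)   # suppose the track's observed maximum is 80.0
steps = 100
s = float(maxv - minv) / steps
cutoff_list = [round(x, 3) for x in np.arange(minv, maxv, s)]
print(len(cutoff_list), cutoff_list[:3])   # about 100 cutoffs: [0.5, 1.295, 2.09]
```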
+ + Returns + ------- + + Cutoff analysis report in str object. + + Todos + ----- + + May need to separate this function out as a class so that we + can add more ways to analyze the result. Also, we can let this + function return a list of dictionary or data.frame in that + way, instead of str object. + """ + chrs: set + peak_content: list + ret_list: list + cutoff_list: list + cutoff_npeaks: list + cutoff_lpeaks: list + chrom: bytes + ret: str + cutoff: cython.float + total_l: cython.long + total_p: cython.long + i: cython.long + n: cython.long + ts: cython.long + te: cython.long + lastp: cython.long + tl: cython.long + peak_length: cython.long + # dict cutoff_npeaks, cutoff_lpeaks + s: cython.float + + chrs = self.get_chr_names() + + # midvalue = self.minvalue/2 + self.maxvalue/2 + # s = float(self.minvalue - midvalue)/steps + minv = max(min_score, self.minvalue) + maxv = min(self.maxvalue, max_score) + + s = float(maxv - minv)/steps + + # a list of possible cutoff values from minv to maxv with step of s + cutoff_list = [round(value, 3) for value in np.arange(minv, maxv, s)] + + cutoff_npeaks = [0] * len(cutoff_list) + cutoff_lpeaks = [0] * len(cutoff_list) + + for chrom in sorted(chrs): + (pos_array, score_array) = self.__data[chrom] + pos_array = np.array(self.__data[chrom][0]) + score_array = np.array(self.__data[chrom][1]) + + for n in range(len(cutoff_list)): + cutoff = cutoff_list[n] + total_l = 0 # total length of peaks + total_p = 0 # total number of peaks + + # get the regions with scores above cutoffs. This is + # not an optimized method. It would be better to store + # score array in a 2-D ndarray? + above_cutoff = np.nonzero(score_array > cutoff)[0] + # end positions of regions where score is above cutoff + above_cutoff_endpos = pos_array[above_cutoff] + # start positions of regions where score is above cutoff + above_cutoff_startpos = pos_array[above_cutoff-1] + + if above_cutoff_endpos.size == 0: + continue + + # first bit of region above cutoff + acs_next = iter(above_cutoff_startpos).__next__ + ace_next = iter(above_cutoff_endpos).__next__ + + ts = acs_next() + te = ace_next() + peak_content = [(ts, te),] + lastp = te + + for i in range(1, above_cutoff_startpos.size): + ts = acs_next() + te = ace_next() + tl = ts - lastp + if tl <= max_gap: + peak_content.append((ts, te)) + else: + peak_length = peak_content[-1][1] - peak_content[0][0] + # if the peak is too small, reject it + if peak_length >= min_length: + total_l += peak_length + total_p += 1 + peak_content = [(ts, te),] + lastp = te + + if peak_content: + peak_length = peak_content[-1][1] - peak_content[0][0] + # if the peak is too small, reject it + if peak_length >= min_length: + total_l += peak_length + total_p += 1 + cutoff_lpeaks[n] += total_l + cutoff_npeaks[n] += total_p + + # prepare the returnning text + ret_list = ["score\tnpeaks\tlpeaks\tavelpeak\n"] + for n in range(len(cutoff_list)-1, -1, -1): + cutoff = cutoff_list[n] + if cutoff_npeaks[n] > 0: + ret_list.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff, + cutoff_npeaks[n], + cutoff_lpeaks[n], + cutoff_lpeaks[n]/cutoff_npeaks[n])) + ret = ''.join(ret_list) + return ret + + +@cython.cclass +class bedGraphTrackII: + """Class for bedGraph type data. + + In bedGraph, data are represented as continuous non-overlapping + regions in the whole genome. I keep this assumption in all the + functions. If data has overlaps, some functions will definitely + give incorrect results. + + 1. 
Continuous: the next region should be after the previous one
+    unless they are on different chromosomes;
+
+    2. Non-overlapping: the next region should never have overlaps
+    with the preceding region.
+
+    The way to memorize bedGraph data is to remember the transition
+    points together with the values of their preceding regions. The
+    last data point may exceed the chromosome end, unless a chromosome
+    dictionary is given. Remember that coordinates in bedGraph and in
+    this class are 0-indexed and right-open.
+
+    Different from bedGraphTrackI, we use a numpy array to store the
+    (end) positions and values.
+
+    """
+    __data: dict
+    maxvalue = cython.declare(cython.float, visibility="public")
+    minvalue = cython.declare(cython.float, visibility="public")
+    baseline_value = cython.declare(cython.float, visibility="public")
+    buffer_size: int
+    __size: dict
+
+    def __init__(self,
+                 baseline_value: cython.float = 0,
+                 buffer_size: cython.int = 100000):
+        """
+        baseline_value is the value to fill in the regions not defined
+        in bedGraph. For example, if the bedGraph is like:
+
+        chr1  100  200  1
+        chr1  250  350  2
+
+        Then the region chr1:200..250 should be filled with baseline_value.
+
+        """
+        self.__data = {}
+        self.__size = {}
+        self.maxvalue = -10000000  # initial maximum value is tiny since I want safe_add_loc to update it
+        self.minvalue = 10000000  # initial minimum value is large since I want safe_add_loc to update it
+        self.baseline_value = baseline_value
+        self.buffer_size = buffer_size
+
+    @cython.ccall
+    def add_loc(self, chromosome: bytes,
+                startpos: cython.int,
+                endpos: cython.int,
+                value: cython.float):
+        """Add a chr-start-end-value block into __data dictionary.
+
+        Note, we don't check if add_loc is called continuously on
+        sorted regions without any gap. So we only suggest calling
+        this function within MACS.
+
+        """
+        pre_v: cython.float
+        c: cnp.ndarray
+        i: cython.int
+
+        # basic assumption, end pos should > start pos
+
+        if endpos <= 0:
+            return
+        if startpos < 0:
+            startpos = 0
+
+        if chromosome not in self.__data:
+            i = 0
+            # first element in the chromosome
+            self.__data[chromosome] = np.zeros(shape=self.buffer_size,
+                                               dtype=[('p', 'u4'), ('v', 'f4')])
+            c = self.__data[chromosome]
+            if startpos > 0:
+                # start pos is not 0, then add two blocks, the first
+                # with "baseline_value"; the second with "value"
+                c[0] = (startpos, self.baseline_value)
+                i += 1
+
+            c[i] = (endpos, value)
+            i += 1
+        else:
+            c = self.__data[chromosome]
+            i = self.__size[chromosome]
+            # get the preceding region
+            pre_v = c[i-1][1]  # which is quicker? c[i-1][1] or c["v"][i-1]?
+
+            # if this region is next to the previous one.
+            if pre_v == value:
+                # if value is the same, simply extend it.
+                c[i-1][0] = endpos
+            else:
+                if i % self.buffer_size == 0:
+                    self.__data[chromosome].resize(i+self.buffer_size,
+                                                   refcheck=False)
+                # otherwise, add a new region
+                c[i] = (endpos, value)
+                i += 1
+
+        self.__size[chromosome] = i
+
+    @cython.ccall
+    def add_loc_wo_merge(self, chromosome: bytes,
+                         startpos: cython.int,
+                         endpos: cython.int,
+                         value: cython.float):
+        """Add a chr-start-end-value block into __data dictionary.
+
+        Note, we don't check if add_loc is called continuously on
+        sorted regions without any gap. So we only suggest calling
+        this function within MACS.
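The storage scheme described above can be sketched standalone: one structured numpy array per chromosome, grown in `buffer_size` chunks while loading and trimmed afterwards (the dtype and resize pattern follow the diff; the values are illustrative):

```python
# One structured record array per chromosome: 'p' holds end positions,
# 'v' holds the values of the runs ending there.
import numpy as np

buffer_size = 100000
data = np.zeros(shape=buffer_size, dtype=[('p', 'u4'), ('v', 'f4')])
size = 0
data[size] = (100, 1.0)               # (end position, value)
size += 1
data[size] = (300, 2.0)
size += 1
if size % buffer_size == 0:           # grow in buffer_size chunks
    data.resize(size + buffer_size, refcheck=False)
data.resize(size, refcheck=False)     # as in finalize(): trim to used length
print(data['p'], data['v'])           # [100 300] [1. 2.]
```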
+ + This one won't merge nearby ranges with the same value + """ + c: cnp.ndarray + i: cython.int + + # basic assumption, end pos should > start pos + + if endpos <= 0: + return + if startpos < 0: + startpos = 0 + + if chromosome not in self.__data: + i = 0 + # first element in the chromosome + self.__data[chromosome] = np.zeros(shape=self.buffer_size, + dtype=[('p', 'u4'), ('v', 'f4')]) + c = self.__data[chromosome] + if startpos > 0: + # start pos is not 0, then add two blocks, the first + # with "baseline_value"; the second with "value" + c[0] = (startpos, self.baseline_value) + i += 1 + + c[i] = (endpos, value) + else: + c = self.__data[chromosome] + i = self.__size[chromosome] + + if i % self.buffer_size == 0: + self.__data[chromosome].resize(i+self.buffer_size, + refcheck=False) + # otherwise, add a new region + c[i] = (endpos, value) + i += 1 + + self.__size[chromosome] = i + + @cython.ccall + def add_chrom_data(self, + chromosome: bytes, + pv: cnp.ndarray): + """Add a pv data to a chromosome. Replace the previous data. + + This is a kinda silly function to waste time and convert a PV + array (2-d named numpy array) into two python arrays for this + BedGraph class. May have better function later. + + Note: no checks for error, use with caution + """ + self.__data[chromosome] = pv + self.__size[chromosome] = len(pv) + + return + + @cython.ccall + def destroy(self) -> bool: + """ destroy content, free memory. + """ + chrs: set + chrom: bytes + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + if chrom in self.__data: + self.__data[chrom].resize(self.buffer_size, + refcheck=False) + self.__data[chrom].resize(0, + refcheck=False) + self.__data[chrom] = None + self.__data.pop(chrom) + self.__size[chrom] = 0 + return True + + @cython.ccall + def finalize(self): + """Resize np arrays. + + Note: If this function is called, please do not add any more + data. remember to call it after all the files are read! + + """ + c: bytes + chrnames: set + maxv: cython.float + minv: cython.float + + chrnames = self.get_chr_names() + + for c in chrnames: + self.__data[c].resize((self.__size[c]), refcheck=False) + self.__data[c].sort(order=['p']) + + minv = self.__data[c]['v'].min() + maxv = self.__data[c]['v'].max() + if maxv > self.maxvalue: + self.maxvalue = maxv + if minv < self.minvalue: + self.minvalue = minv + return + + @cython.ccall + def get_data_by_chr(self, chromosome: bytes) -> cnp.ndarray: + """Return array of counts by chromosome. + + The return value is a tuple: + ([end pos],[value]) + """ + if chromosome in self.__data: + return self.__data[chromosome] + else: + return None + + @cython.ccall + def get_chr_names(self) -> set: + """Return all the chromosome names stored. + + """ + return set(sorted(self.__data.keys())) + + @cython.ccall + def filter_score(self, cutoff: cython.float = 0) -> bool: + """Filter using a score cutoff. Any region lower than score + cutoff will be set to self.baseline_value. + + Self will be modified. + """ + # new_pre_pos: cython.int + chrom: bytes + chrs: set + d: cnp.ndarray + maxv: cython.float + minv: cython.float + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + d = self.__data[chrom] + d = d[d['v'] > cutoff] + self.__data[chrom] = d + self.__size[chrom] = len(d) + minv = d['v'].min() + maxv = d['v'].max() + if maxv > self.maxvalue: + self.maxvalue = maxv + if minv < self.minvalue: + self.minvalue = minv + return True + + @cython.ccall + def summary(self) -> tuple: + """Calculate the sum, total_length, max, min, mean, and std. 
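The vectorized `filter_score` above is plain boolean masking on the structured array. In isolation, with illustrative data:

```python
# Keep only records whose value is strictly greater than the cutoff.
import numpy as np

d = np.array([(100, 0.5), (200, 2.0), (300, 1.5)],
             dtype=[('p', 'u4'), ('v', 'f4')])
cutoff = 1.0
d = d[d['v'] > cutoff]
print(d['p'], d['v'])   # [200 300] [2.  1.5]
```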
+
+        Return a tuple for (sum, total_length, max, min, mean, std).
+
+        """
+        d: cnp.ndarray
+        n_v: cython.long
+        sum_v: cython.float
+        max_v: cython.float
+        min_v: cython.float
+        mean_v: cython.float
+        variance: cython.float
+        tmp: cython.float
+        std_v: cython.float
+        pre_p: cython.int
+        ln: cython.int
+        i: cython.int
+
+        pre_p = 0
+        n_v = 0
+        sum_v = 0
+        max_v = -100000
+        min_v = 100000
+        for d in self.__data.values():
+            # for each chromosome
+            pre_p = 0
+            for i in range(len(d)):
+                # for each region
+                ln = d[i][0]-pre_p
+                sum_v += d[i][1]*ln
+                n_v += ln
+                pre_p = d[i][0]
+            max_v = max(max(d["v"]), max_v)
+            min_v = min(min(d["v"]), min_v)
+        mean_v = sum_v/n_v
+        variance = 0.0
+        for d in self.__data.values():
+            pre_p = 0
+            for i in range(len(d)):
+                # for each region
+                tmp = d[i][1]-mean_v
+                ln = d[i][0]-pre_p
+                variance += tmp*tmp*ln
+                pre_p = d[i][0]
+
+        variance /= float(n_v-1)
+        std_v = sqrt(variance)
+        return (sum_v, n_v, max_v, min_v, mean_v, std_v)
+
+    @cython.ccall
+    def call_peaks(self,
+                   cutoff: cython.float = 1.0,
+                   min_length: cython.int = 200,
+                   max_gap: cython.int = 50,
+                   call_summits: bool = False):
+        """This function tries to find regions within which scores
+        are continuously higher than a given cutoff.
+
+        """
+        i: cython.int
+        chrom: bytes
+        pos: cnp.ndarray
+        value: cnp.ndarray
+        above_cutoff: cnp.ndarray(dtype="bool", ndim=1)
+        above_cutoff_v: cnp.ndarray
+        above_cutoff_endpos: cnp.ndarray
+        above_cutoff_startpos: cnp.ndarray
+        peak_content: list
+
+        chrs = self.get_chr_names()
+        peaks = PeakIO()  # dictionary to save peaks
+
+        for chrom in sorted(chrs):
+            peak_content = []  # to store points above cutoff
+            pos = self.__data[chrom]['p']
+            value = self.__data[chrom]['v']
+
+            above_cutoff = value >= cutoff
+            # scores where score is above cutoff
+            above_cutoff_v = value[above_cutoff]
+            # end positions of regions where score is above cutoff
+            above_cutoff_endpos = pos[above_cutoff]
+            # start positions of regions where score is above cutoff
+            above_cutoff_startpos = pos[np.roll(above_cutoff, -1)]
+
+            if above_cutoff_v.size == 0:
+                # nothing above cutoff
+                continue
+
+            if above_cutoff[0]:
+                # first element > cutoff, fix the first point as 0.
otherwise it would be the last item in __data[chrom]['p'] + above_cutoff_startpos[0] = 0 + + # first bit of region above cutoff + peak_content.append((above_cutoff_startpos[0], above_cutoff_endpos[0], above_cutoff_v[0])) + for i in range(1, above_cutoff_startpos.size): + if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap: + # append + peak_content.append((above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i])) + else: + # close + self.__close_peak(peak_content, + peaks, + min_length, + chrom) + peak_content = [(above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i]),] + + # save the last peak + if not peak_content: + continue + else: + self.__close_peak(peak_content, + peaks, + min_length, + chrom) + + return peaks + + @cython.cfunc + def __close_peak(self, + peak_content: list, + peaks, + min_length: cython.int, + chrom: bytes) -> bool: + tsummit: list # list for temporary summits + peak_length: cython.int + summit: cython.int + tstart: cython.int + tend: cython.int + summit_value: cython.float + tvalue: cython.float + peak_length = peak_content[-1][1]-peak_content[0][0] + if peak_length >= min_length: # if the peak is too small, reject it + tsummit = [] + summit = 0 + summit_value = 0 + for (tstart, tend, tvalue) in peak_content: + if not summit_value or summit_value < tvalue: + tsummit = [cython.cast(cython.int, (tend+tstart)/2),] + summit_value = tvalue + elif summit_value == tvalue: + tsummit.append(cython.cast(cython.int, (tend+tstart)/2)) + summit = tsummit[cython.cast(cython.int, (len(tsummit)+1)/2)-1] + peaks.add(chrom, + peak_content[0][0], + peak_content[-1][1], + summit=summit, + peak_score=summit_value, + pileup=0, + pscore=0, + fold_change=0, + qscore=0 + ) + return True + + @cython.ccall + def call_broadpeaks(self, + lvl1_cutoff: cython.float = 500, + lvl2_cutoff: cython.float = 100, + min_length: cython.int = 200, + lvl1_max_gap: cython.int = 50, + lvl2_max_gap: cython.int = 400): + """This function try to find enriched regions within which, + scores are continuously higher than a given cutoff for level + 1, and link them using the gap above level 2 cutoff with a + maximum length of lvl2_max_gap. + + lvl1_cutoff: cutoff of value at enriched regions, default 500. + lvl2_cutoff: cutoff of value at linkage regions, default 100. + min_length : minimum peak length, default 200. + lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. + lvl2_max_gap : maximum length of linkage regions, default 400. + colname: can be 'sample','control','-100logp','-100logq'. Cutoff will be applied to the specified column. + + Return both general PeakIO object for highly enriched regions + and gapped broad regions in BroadPeakIO. + """ + chrom: bytes + i: cython.int + j: cython.int + chrs: set + lvl1: PeakContent + lvl2: PeakContent # PeakContent class object + lvl1peakschrom: list + lvl2peakschrom: list + + assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2." + assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1." 
+ lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, + min_length=min_length, + max_gap=lvl1_max_gap, + call_summits=False) + lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, + min_length=min_length, + max_gap=lvl2_max_gap, + call_summits=False) + chrs = lvl1_peaks.get_chr_names() + broadpeaks = BroadPeakIO() + # use lvl2_peaks as linking regions between lvl1_peaks + for chrom in sorted(chrs): + lvl1peakschrom = lvl1_peaks.get_data_from_chrom(chrom) + lvl2peakschrom = lvl2_peaks.get_data_from_chrom(chrom) + lvl1peakschrom_next = iter(lvl1peakschrom).__next__ + tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region + # our assumption is lvl1 regions should be included in lvl2 regions + try: + lvl1 = lvl1peakschrom_next() + for i in range(len(lvl2peakschrom)): + # for each lvl2 peak, find all lvl1 peaks inside + lvl2 = lvl2peakschrom[i] + while True: + if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: + tmppeakset.append(lvl1) + lvl1 = lvl1peakschrom_next() + else: + self.__add_broadpeak(broadpeaks, + chrom, + lvl2, + tmppeakset) + tmppeakset = [] + break + except StopIteration: + self.__add_broadpeak(broadpeaks, chrom, lvl2, tmppeakset) + tmppeakset = [] + for j in range(i+1, len(lvl2peakschrom)): + self.__add_broadpeak(broadpeaks, + chrom, + lvl2peakschrom[j], + tmppeakset) + return broadpeaks + + @cython.cfunc + def __add_broadpeak(self, + bpeaks, + chrom: bytes, + lvl2peak: PeakContent, + lvl1peakset: list): + """Internal function to create broad peak. + """ + start: cython.int + end: cython.int + blockNum: cython.int + blockSizes: bytes + blockStarts: bytes + thickStart: bytes + thickEnd: bytes + + start = lvl2peak["start"] + end = lvl2peak["end"] + + # the following code will add those broad/lvl2 peaks with no + # strong/lvl1 peaks inside + if not lvl1peakset: + # try: + # will complement by adding 1bps start and end to this region + # may change in the future if gappedPeak format was improved. + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=(b"%d" % start), + thickEnd=(b"%d" % end), + blockNum=2, + blockSizes=b"1,1", + blockStarts=(b"0,%d" % (end-start-1)), + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + + thickStart = b"%d" % lvl1peakset[0]["start"] + thickEnd = b"%d" % lvl1peakset[-1]["end"] + blockNum = len(lvl1peakset) + blockSizes = b",".join([b"%d" % x["length"] for x in lvl1peakset]) + blockStarts = b",".join([b"%d" % (x["start"]-start) for x in lvl1peakset]) + + if int(thickStart) != start: + # add 1bp left block + thickStart = b"%d" % start + blockNum += 1 + blockSizes = b"1,"+blockSizes + blockStarts = b"0,"+blockStarts + if int(thickEnd) != end: + # add 1bp right block + thickEnd = b"%d" % end + blockNum += 1 + blockSizes = blockSizes+b",1" + blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) + + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=thickStart, + thickEnd=thickEnd, + blockNum=blockNum, + blockSizes=blockSizes, + blockStarts=blockStarts, + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + + @cython.ccall + def refine_peaks(self, peaks): + """This function try to based on given peaks, re-evaluate the + peak region, call the summit. 
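The 1bp-block padding in `__add_broadpeak` above guarantees that the gappedPeak blocks span the whole broad region. A standalone trace with made-up coordinates (the padding logic follows the diff):

```python
# One lvl1 peak at 120-180 inside a broad region 100-400; pad with 1bp
# blocks on either side so blocks cover the region's full extent.
start, end = 100, 400
blockSizes, blockStarts, blockNum = b"60", b"20", 1
thickStart, thickEnd = b"120", b"180"
if int(thickStart) != start:
    thickStart = b"%d" % start
    blockNum += 1
    blockSizes = b"1," + blockSizes
    blockStarts = b"0," + blockStarts
if int(thickEnd) != end:
    thickEnd = b"%d" % end
    blockNum += 1
    blockSizes = blockSizes + b",1"
    blockStarts = blockStarts + b",%d" % (end - start - 1)
print(blockNum, blockSizes, blockStarts)   # 3 b'1,60,1' b'0,20,299'
```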
+ + peaks: PeakIO object + return: a new PeakIO object + + """ + pre_p: cython.int + p: cython.int + peak_s: cython.int + peak_e: cython.int + v: cython.float + chrom: bytes + chrs: set + + peaks.sort() + new_peaks = PeakIO() + chrs = self.get_chr_names() + assert isinstance(peaks, PeakIO) + chrs = chrs.intersection(set(peaks.get_chr_names())) + + for chrom in sorted(chrs): + peaks_chr = peaks.get_data_from_chrom(chrom) + peak_content = [] + # arrays for position and values + (ps, vs) = self.get_data_by_chr(chrom) + # assign the next function to a viable to speed up + psn = iter(ps).__next__ + vsn = iter(vs).__next__ + peakn = iter(peaks_chr).__next__ + + # remember previous position in bedgraph/self + pre_p = 0 + p = psn() + v = vsn() + peak = peakn() + peak_s = peak["start"] + peak_e = peak["end"] + while True: + # look for overlap + if p > peak_s and peak_e > pre_p: + # now put four coordinates together and pick the middle two + s, e = sorted([p, peak_s, peak_e, pre_p])[1:3] + # add this content + peak_content.append((s, e, v)) + # move self/bedGraph + try: + pre_p = p + p = psn() + v = vsn() + except Exception: + # no more value chunk in bedGraph + break + elif pre_p >= peak_e: + # close peak + self.__close_peak(peak_content, new_peaks, 0, chrom) + peak_content = [] + # move peak + try: + peak = peakn() + peak_s = peak["start"] + peak_e = peak["end"] + except Exception: + # no more peak + break + elif peak_s >= p: + # move self/bedgraph + try: + pre_p = p + p = psn() + v = vsn() + except Exception: + # no more value chunk in bedGraph + break + else: + raise Exception(f"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}") + + # save the last peak + if peak_content: + self.__close_peak(peak_content, new_peaks, 0, chrom) + return new_peaks + + @cython.ccall + def total(self) -> cython.int: + """Return the number of regions in this object. + + """ + t: cython.int + d: cnp.ndarray + + t = 0 + for d in self.__data.values(): + t += len(d) + return t + + # @cython.ccall + # def set_single_value(self, new_value: cython.float): + # """Change all the values in bedGraph to the same new_value, + # return a new bedGraphTrackI. + + # """ + # chrom: bytes + # max_p: cython.int + + # ret = bedGraphTrackI() + # chroms = set(self.get_chr_names()) + # for chrom in sorted(chroms): + # # arrays for position and values + # (p1, v1) = self.get_data_by_chr(chrom) + # # maximum p + # max_p = max(p1) + # # add a region from 0 to max_p + # ret.add_loc(chrom, 0, max_p, new_value) + # return ret + + # @cython.ccall + # def overlie(self, bdgTracks, func: str = "max"): + # """Calculate two or more bedGraphTrackI objects by letting self + # overlying bdgTrack2, with user-defined functions. + + # Transition positions from both bedGraphTrackI objects will be + # considered and combined. For example: + + # #1 bedGraph (self) | #2 bedGraph + # ----------------------------------------------- + # chr1 0 100 0 | chr1 0 150 1 + # chr1 100 200 3 | chr1 150 250 2 + # chr1 200 300 4 | chr1 250 300 4 + + # these two bedGraphs will be combined to have five transition + # points: 100, 150, 200, 250, and 300. 
So in order to calculate + # two bedGraphs, I pair values within the following regions + # like: + + # chr s e (#1,#2) applied_func_max + # ----------------------------------------------- + # chr1 0 100 (0,1) 1 + # chr1 100 150 (3,1) 3 + # chr1 150 200 (3,2) 3 + # chr1 200 250 (4,2) 4 + # chr1 250 300 (4,4) 4 + + # Then the given 'func' will be applied on each 2-tuple as func(#1,#2) + + # Supported 'func' are "sum", "subtract" (only for two bdg + # objects), "product", "divide" (only for two bdg objects), + # "max", "mean" and "fisher". + + # Return value is a new bedGraphTrackI object. + + # Option: bdgTracks can be a list of bedGraphTrackI objects + # """ + # pre_p: cython.int + # chrom: bytes + + # nr_tracks = len(bdgTracks) + 1 # +1 for self + # assert nr_tracks >= 2, "Specify at least one more bdg objects." + # for i, bdgTrack in enumerate(bdgTracks): + # assert isinstance(bdgTrack, bedGraphTrackI), "bdgTrack{} is not a bedGraphTrackI object".format(i + 1) + + # if func == "max": + # f = max + # elif func == "mean": + # f = mean_func + # elif func == "fisher": + # f = fisher_func + # elif func == "sum": + # f = sum + # elif func == "product": + # f = product_func + # elif func == "subtract": + # if nr_tracks == 2: + # f = subtract_func + # else: + # raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") + # elif func == "divide": + # if nr_tracks == 2: + # f = divide_func + # else: + # raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") + # else: + # raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. ") + + # ret = bedGraphTrackI() + + # common_chr = set(self.get_chr_names()) + # for track in bdgTracks: + # common_chr = common_chr.intersection(set(track.get_chr_names())) + + # for chrom in sorted(common_chr): + # datas = [self.get_data_by_chr(chrom)] + # datas.extend([bdgTracks[i].get_data_by_chr(chrom) for i in range(len(bdgTracks))]) + + # ps, vs, pn, vn = [], [], [], [] + # for data in datas: + # ps.append(data[0]) + # pn.append(iter(ps[-1]).__next__) + # vs.append(data[1]) + # vn.append(iter(vs[-1]).__next__) + + # pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret + # try: + # ps_cur = [pn[i]() for i in range(len(pn))] + # vs_cur = [vn[i]() for i in range(len(pn))] + + # while True: + # # get the lowest position + # lowest_p = min(ps_cur) + + # # at least one lowest position, could be multiple + # locations = [i for i in range(len(ps_cur)) if ps_cur[i] == lowest_p] + + # # add the data until the interval + # ret.add_loc(chrom, pre_p, ps_cur[locations[0]], f(vs_cur)) + + # pre_p = ps_cur[locations[0]] + # for index in locations: + # ps_cur[index] = pn[index]() + # vs_cur[index] = vn[index]() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + # return ret + + # @cython.ccall + # def apply_func(self, func) -> bool: + # """Apply function 'func' to every value in this bedGraphTrackI object. + + # *Two adjacent regions with same value after applying func will + # not be merged. + # """ + # i: cython.int + + # for (p, s) in self.__data.values(): + # for i in range(len(s)): + # s[i] = func(s[i]) + # self.maxvalue = func(self.maxvalue) + # self.minvalue = func(self.minvalue) + # return True + + # @cython.ccall + # def p2q(self): + # """Convert pvalue scores to qvalue scores. + + # *Assume scores in this bedGraph are pvalue scores! 
Not work + # for other type of scores. + # """ + # chrom: bytes + # pos_array: pyarray + # pscore_array: pyarray + # pvalue_stat: dict = {} + # pqtable: dict = {} + # pre_p: cython.long + # this_p: cython.long + # # pre_l: cython.long + # # l: cython.long + # i: cython.long + # nhcal: cython.long = 0 + # N: cython.long + # k: cython.long + # this_l: cython.long + # this_v: cython.float + # # pre_v: cython.float + # v: cython.float + # q: cython.float + # pre_q: cython.float + # f: cython.float + # unique_values: list + + # # calculate frequencies of each p-score + # for chrom in sorted(self.get_chr_names()): + # pre_p = 0 + + # [pos_array, pscore_array] = self.__data[chrom] + + # pn = iter(pos_array).__next__ + # vn = iter(pscore_array).__next__ + + # for i in range(len(pos_array)): + # this_p = pn() + # this_v = vn() + # this_l = this_p - pre_p + # if this_v in pvalue_stat: + # pvalue_stat[this_v] += this_l + # else: + # pvalue_stat[this_v] = this_l + # pre_p = this_p + + # # nhcal += len(pos_array) + + # # nhval = 0 + + # N = sum(pvalue_stat.values()) # total length + # k = 1 # rank + # f = -log10(N) + # # pre_v = -2147483647 + # # pre_l = 0 + # pre_q = 2147483647 # save the previous q-value + + # # calculate qscore for each pscore + # pqtable = {} + # unique_values = sorted(pvalue_stat.keys(), reverse=True) + # for i in range(len(unique_values)): + # v = unique_values[i] + # # l = pvalue_stat[v] + # q = v + (log10(k) + f) + # q = max(0, min(pre_q, q)) # make q-score monotonic + # pqtable[v] = q + # # pre_v = v + # pre_q = q + # # k += l + # nhcal += 1 + + # # convert pscore to qscore + # for chrom in sorted(self.get_chr_names()): + # [pos_array, pscore_array] = self.__data[chrom] + + # for i in range(len(pos_array)): + # pscore_array[i] = pqtable[pscore_array[i]] + + # self.merge_regions() + # return + + # @cython.ccall + # def extract_value(self, bdgTrack2): + # """Extract values from regions defined in bedGraphTrackI class object + # `bdgTrack2`. + + # """ + # pre_p: cython.int + # p1: cython.int + # p2: cython.int + # i: cython.int + # v1: cython.float + # v2: cython.float + # chrom: bytes + + # assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object" + + # # 1: region in bdgTrack2; 2: value; 3: length with the value + # ret = [[], pyarray('f', []), pyarray('L', [])] + # radd = ret[0].append + # vadd = ret[1].append + # ladd = ret[2].append + + # chr1 = set(self.get_chr_names()) + # chr2 = set(bdgTrack2.get_chr_names()) + # common_chr = chr1.intersection(chr2) + # for i in range(len(common_chr)): + # chrom = common_chr.pop() + # (p1s, v1s) = self.get_data_by_chr(chrom) # arrays for position and values + # # assign the next function to a viable to speed up + # p1n = iter(p1s).__next__ + # v1n = iter(v1s).__next__ + + # # arrays for position and values + # (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p2n = iter(p2s).__next__ + # v2n = iter(v2s).__next__ + # # remember the previous position in the new bedGraphTrackI + # # object ret + # pre_p = 0 + # try: + # p1 = p1n() + # v1 = v1n() + + # p2 = p2n() + # v2 = v2n() + + # while True: + # if p1 < p2: + # # clip a region from pre_p to p1, then set pre_p as p1. + # if v2 > 0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # vadd(v1) + # ladd(p1-pre_p) + # pre_p = p1 + # # call for the next p1 and v1 + # p1 = p1n() + # v1 = v1n() + # elif p2 < p1: + # # clip a region from pre_p to p2, then set + # # pre_p as p2. 
+ # if v2 > 0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p2)) + # vadd(v1) + # ladd(p2-pre_p) + # pre_p = p2 + # # call for the next p2 and v2 + # p2 = p2n() + # v2 = v2n() + # elif p1 == p2: + # # from pre_p to p1 or p2, then set pre_p as p1 or p2. + # if v2 > 0: + # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # vadd(v1) + # ladd(p1-pre_p) + # pre_p = p1 + # # call for the next p1, v1, p2, v2. + # p1 = p1n() + # v1 = v1n() + # p2 = p2n() + # v2 = v2n() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + + # return ret + + # @cython.ccall + # def extract_value_hmmr(self, bdgTrack2): + # """Extract values from regions defined in bedGraphTrackI class object + # `bdgTrack2`. + + # I will try to tweak this function to output only the values of + # bdgTrack1 (self) in the regions in bdgTrack2 + + # This is specifically for HMMRATAC. bdgTrack2 should be a + # bedgraph object containing the bins with value set to + # 'mark_bin' -- the bins in the same region will have the same + # value. + # """ + # # pre_p: cython.int + # p1: cython.int + # p2: cython.int + # i: cython.int + # v1: cython.float + # v2: cython.float + # chrom: bytes + # ret: list + + # assert isinstance(bdgTrack2, bedGraphTrackI), "not a bedGraphTrackI object" + + # # 0: bin location (chrom, position); 1: value; 2: number of bins in this region + # ret = [[], pyarray('f', []), pyarray('i', [])] + # padd = ret[0].append + # vadd = ret[1].append + # ladd = ret[2].append + + # chr1 = set(self.get_chr_names()) + # chr2 = set(bdgTrack2.get_chr_names()) + # common_chr = sorted(list(chr1.intersection(chr2))) + # for i in range(len(common_chr)): + # chrom = common_chr.pop() + # # arrays for position and values + # (p1s, v1s) = self.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p1n = iter(p1s).__next__ + # v1n = iter(v1s).__next__ + + # # arrays for position and values + # (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p2n = iter(p2s).__next__ + # v2n = iter(v2s).__next__ + # # remember the previous position in the new bedGraphTrackI + # # object ret + # # pre_p = 0 + # try: + # p1 = p1n() + # v1 = v1n() + + # p2 = p2n() + # v2 = v2n() + + # while True: + # if p1 < p2: + # # clip a region from pre_p to p1, then set pre_p as p1. + # # in this case, we don't output any + # # if v2>0: + # # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) + # # vadd(v1) + # # ladd(p1-pre_p) + # # pre_p = p1 + # # call for the next p1 and v1 + # p1 = p1n() + # v1 = v1n() + # elif p2 < p1: + # # clip a region from pre_p to p2, then set pre_p as p2. + # if v2 != 0: # 0 means it's a gap region, we should have value > 1 + # padd((chrom, p2)) + # vadd(v1) + # ladd(int(v2)) + # # pre_p = p2 + # # call for the next p2 and v2 + # p2 = p2n() + # v2 = v2n() + # elif p1 == p2: + # # from pre_p to p1 or p2, then set pre_p as p1 or p2. + # if v2 != 0: # 0 means it's a gap region, we should have 1 or -1 + # padd((chrom, p2)) + # vadd(v1) + # ladd(int(v2)) + # # pre_p = p1 + # # call for the next p1, v1, p2, v2. + # p1 = p1n() + # v1 = v1n() + # p2 = p2n() + # v2 = v2n() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + + # return ret + + # @cython.ccall + # def make_ScoreTrackII_for_macs(self, bdgTrack2, + # depth1: float = 1.0, + # depth2: float = 1.0): + # """A modified overlie function for MACS v2. + + # effective_depth_in_million: sequencing depth in million after + # duplicates being filtered. 
If + # treatment is scaled down to + # control sample size, then this + # should be control sample size in + # million. And vice versa. + + # Return value is a ScoreTrackII object. + # """ + # # pre_p: cython.int + # p1: cython.int + # p2: cython.int + # v1: cython.float + # v2: cython.float + # chrom: bytes + + # assert isinstance(bdgTrack2, bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" + + # ret = ScoreTrackII(treat_depth=depth1, + # ctrl_depth=depth2) + # retadd = ret.add + + # chr1 = set(self.get_chr_names()) + # chr2 = set(bdgTrack2.get_chr_names()) + # common_chr = chr1.intersection(chr2) + # for chrom in sorted(common_chr): + # # arrays for position and values + # (p1s, v1s) = self.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p1n = iter(p1s).__next__ + # v1n = iter(v1s).__next__ + # # arrays for position and values + # (p2s, v2s) = bdgTrack2.get_data_by_chr(chrom) + # # assign the next function to a viable to speed up + # p2n = iter(p2s).__next__ + # v2n = iter(v2s).__next__ + + # # this is the maximum number of locations needed to be + # # recorded in scoreTrackI for this chromosome. + # chrom_max_len = len(p1s)+len(p2s) + + # ret.add_chromosome(chrom, chrom_max_len) + + # # remember the previous position in the new bedGraphTrackI + # # object ret + # # pre_p = 0 + + # try: + # p1 = p1n() + # v1 = v1n() + + # p2 = p2n() + # v2 = v2n() + + # while True: + # if p1 < p2: + # # clip a region from pre_p to p1, then set pre_p as p1. + # retadd(chrom, p1, v1, v2) + # # pre_p = p1 + # # call for the next p1 and v1 + # p1 = p1n() + # v1 = v1n() + # elif p2 < p1: + # # clip a region from pre_p to p2, then set pre_p as p2. + # retadd(chrom, p2, v1, v2) + # # pre_p = p2 + # # call for the next p2 and v2 + # p2 = p2n() + # v2 = v2n() + # elif p1 == p2: + # # from pre_p to p1 or p2, then set pre_p as p1 or p2. + # retadd(chrom, p1, v1, v2) + # # pre_p = p1 + # # call for the next p1, v1, p2, v2. + # p1 = p1n() + # v1 = v1n() + # p2 = p2n() + # v2 = v2n() + # except StopIteration: + # # meet the end of either bedGraphTrackI, simply exit + # pass + + # ret.finalize() + # # ret.merge_regions() + # return ret + + # @cython.ccall + # def cutoff_analysis(self, + # max_gap: cython.int, + # min_length: cython.int, + # steps: cython.int = 100, + # min_score: cython.float = 0, + # max_score: cython.float = 1000) -> str: + # """ + # Cutoff analysis function for bedGraphTrackI object. + + # This function will try all possible cutoff values on the score + # column to call peaks. Then will give a report of a number of + # metrics (number of peaks, total length of peaks, average + # length of peak) at varying score cutoffs. For each score + # cutoff, the function finds the positions where the score + # exceeds the cutoff, then groups those positions into "peaks" + # based on the maximum allowed gap (max_gap) between consecutive + # positions. If a peak's length exceeds the minimum length + # (min_length), the peak is counted. + + # Parameters + # ---------- + + # max_gap : int32_t + # Maximum allowed gap between consecutive positions above cutoff + + # min_length : int32_t Minimum length of peak + # steps: int32_t + # It will be used to calculate 'step' to increase from min_v to + # max_v (see below). + + # min_score: float32_t + # Minimum score for cutoff analysis. Note1: we will take the + # larger value between the actual minimum value in the BedGraph + # and min_score as min_v. Note2: the min_v won't be included in + # the final result. 
We will try to output the smallest cutoff as + # min_v+step. + + # max_score: float32_t + # Maximum score for cutoff analysis. Note1: we will take the + # smaller value between the actual maximum value in the BedGraph + # and max_score as max_v. Note2: the max_v may not be included + # in the final result. We will only output the cutoff that can + # generate at least 1 peak. + + # Returns + # ------- + + # Cutoff analysis report in str object. + + # Todos + # ----- + + # May need to separate this function out as a class so that we + # can add more ways to analyze the result. Also, we can let this + # function return a list of dictionary or data.frame in that + # way, instead of str object. + # """ + # chrs: set + # peak_content: list + # ret_list: list + # cutoff_list: list + # cutoff_npeaks: list + # cutoff_lpeaks: list + # chrom: bytes + # ret: str + # cutoff: cython.float + # total_l: cython.long + # total_p: cython.long + # i: cython.long + # n: cython.long + # ts: cython.long + # te: cython.long + # lastp: cython.long + # tl: cython.long + # peak_length: cython.long + # # dict cutoff_npeaks, cutoff_lpeaks + # s: cython.float + + # chrs = self.get_chr_names() + + # # midvalue = self.minvalue/2 + self.maxvalue/2 + # # s = float(self.minvalue - midvalue)/steps + # minv = max(min_score, self.minvalue) + # maxv = min(self.maxvalue, max_score) + + # s = float(maxv - minv)/steps + + # # a list of possible cutoff values from minv to maxv with step of s + # cutoff_list = [round(value, 3) for value in np.arange(minv, maxv, s)] + + # cutoff_npeaks = [0] * len(cutoff_list) + # cutoff_lpeaks = [0] * len(cutoff_list) + + # for chrom in sorted(chrs): + # (pos_array, score_array) = self.__data[chrom] + # pos_array = np.array(self.__data[chrom][0]) + # score_array = np.array(self.__data[chrom][1]) + + # for n in range(len(cutoff_list)): + # cutoff = cutoff_list[n] + # total_l = 0 # total length of peaks + # total_p = 0 # total number of peaks + + # # get the regions with scores above cutoffs. This is + # # not an optimized method. It would be better to store + # # score array in a 2-D ndarray? 
+ # above_cutoff = np.nonzero(score_array > cutoff)[0] + # # end positions of regions where score is above cutoff + # above_cutoff_endpos = pos_array[above_cutoff] + # # start positions of regions where score is above cutoff + # above_cutoff_startpos = pos_array[above_cutoff-1] + + # if above_cutoff_endpos.size == 0: + # continue + + # # first bit of region above cutoff + # acs_next = iter(above_cutoff_startpos).__next__ + # ace_next = iter(above_cutoff_endpos).__next__ + + # ts = acs_next() + # te = ace_next() + # peak_content = [(ts, te),] + # lastp = te + + # for i in range(1, above_cutoff_startpos.size): + # ts = acs_next() + # te = ace_next() + # tl = ts - lastp + # if tl <= max_gap: + # peak_content.append((ts, te)) + # else: + # peak_length = peak_content[-1][1] - peak_content[0][0] + # # if the peak is too small, reject it + # if peak_length >= min_length: + # total_l += peak_length + # total_p += 1 + # peak_content = [(ts, te),] + # lastp = te + + # if peak_content: + # peak_length = peak_content[-1][1] - peak_content[0][0] + # # if the peak is too small, reject it + # if peak_length >= min_length: + # total_l += peak_length + # total_p += 1 + # cutoff_lpeaks[n] += total_l + # cutoff_npeaks[n] += total_p + + # # prepare the returnning text + # ret_list = ["score\tnpeaks\tlpeaks\tavelpeak\n"] + # for n in range(len(cutoff_list)-1, -1, -1): + # cutoff = cutoff_list[n] + # if cutoff_npeaks[n] > 0: + # ret_list.append("%.2f\t%d\t%d\t%.2f\n" % (cutoff, + # cutoff_npeaks[n], + # cutoff_lpeaks[n], + # cutoff_lpeaks[n]/cutoff_npeaks[n])) + # ret = ''.join(ret_list) + # return ret + + +@cython.cfunc +def calculate_elbows(values: cnp.ndarray, + threshold: cython.float = 0.01) -> cnp.ndarray: + # although this function is supposed to find elbow pts for cutoff + # analysis, however, in reality, it barely works... + deltas: cnp.ndarray + slopes: cnp.ndarray + delta_slopes: cnp.ndarray + elbows: cnp.ndarray + avg_delta_slope: cython.float + + # Calculate the difference between each point and the first point + deltas = values - values[0] + # Calculate the slope between each point and the last point + slopes = deltas / (values[-1] - values[0]) + # Calculate the change in slope + delta_slopes = np.diff(slopes) + # Calculate the average change in slope + avg_delta_slope = np.mean(delta_slopes) + # Find all points where the change in slope is significantly + # larger than the average + elbows = np.where(delta_slopes > avg_delta_slope + threshold)[0] + return elbows diff --git a/MACS3/Signal/BedGraph.pyx b/MACS3/Signal/BedGraph.pyx deleted file mode 100644 index df57c4f7..00000000 --- a/MACS3/Signal/BedGraph.pyx +++ /dev/null @@ -1,1340 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2024-05-15 19:27:06 Tao Liu> - -"""Module for BedGraph data class. - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). 
-""" - -# ------------------------------------ -# python modules -# ------------------------------------ -#from array import array -from cpython cimport array -from array import array as pyarray -from math import prod -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.Signal.ScoreTrack import ScoreTrackII -from MACS3.IO.PeakIO import PeakIO, BroadPeakIO -from MACS3.Signal.Prob import chisq_logp_e - -# ------------------------------------ -# Other modules -# ------------------------------------ - -from cpython cimport bool -import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t - -# ------------------------------------ -# C lib -# ------------------------------------ - -from libc.math cimport sqrt, log, log1p, exp, log10 - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "BedGraph $Revision$" -__author__ = "Tao Liu " -__doc__ = "bedGraphTrackI class" - -# ------------------------------------ -# Misc functions -# ------------------------------------ -LOG10_E = 0.43429448190325176 - -cdef inline mean_func( x ): - return sum( x )/len( x ) - -cdef inline fisher_func( x ): - # combine -log10pvalues - return chisq_logp_e( 2*sum (x )/LOG10_E, 2*len( x ), log10=True ) - -cdef inline subtract_func( x ): - # subtraction of two items list - return x[1] - x[0] - -cdef inline divide_func( x ): - # division of two items list - return x[1] / x[2] - -cdef inline product_func( x ): - # production of a list of values - # only python 3.8 or above - return prod( x ) - -# ------------------------------------ -# Classes -# ------------------------------------ -cdef class bedGraphTrackI: - """Class for bedGraph type data. - - In bedGraph, data are represented as continuous non-overlapping - regions in the whole genome. I keep this assumption in all the - functions. If data has overlaps, some functions will definitely - give incorrect results. - - 1. Continuous: the next region should be after the previous one - unless they are on different chromosomes; - - 2. Non-overlapping: the next region should never have overlaps - with preceding region. - - The way to memorize bedGraph data is to remember the transition - points together with values of their preceding regions. The last - data point may exceed chromosome end, unless a chromosome - dictionary is given. Remember the coordinations in bedGraph and - this class is 0-indexed and right-open. - - """ - cdef: - dict __data - public float32_t maxvalue - public float32_t minvalue - public float32_t baseline_value - - def __init__ (self, float32_t baseline_value=0 ): - """ - baseline_value is the value to fill in the regions not defined - in bedGraph. For example, if the bedGraph is like: - - chr1 100 200 1 - chr1 250 350 2 - - Then the region chr1:200..250 should be filled with baseline_value. - - """ - self.__data = {} - self.maxvalue = -10000000 # initial maximum value is tiny since I want safe_add_loc to update it - self.minvalue = 10000000 # initial minimum value is large since I want safe_add_loc to update it - self.baseline_value = baseline_value - - cpdef add_loc ( self, bytes chromosome, int32_t startpos, int32_t endpos, float32_t value): - """Add a chr-start-end-value block into __data dictionary. - - Note, we don't check if the add_loc is called continuously on - sorted regions without any gap. 
So we only suggest calling - this function within MACS. - - """ - cdef float32_t pre_v - # basic assumption, end pos should > start pos - - if endpos <= 0: - return - if startpos < 0: - startpos = 0 - - if chromosome not in self.__data: - self.__data[chromosome] = [ pyarray('i',[]), pyarray('f',[]) ] - c = self.__data[chromosome] - if startpos: - # start pos is not 0, then add two blocks, the first - # with "baseline_value"; the second with "value" - c[0].append(startpos) - c[1].append(self.baseline_value) - c[0].append(endpos) - c[1].append(value) - else: - c = self.__data[chromosome] - # get the preceding region - pre_v = c[1][-1] - - # if this region is next to the previous one. - if pre_v == value: - # if value is the same, simply extend it. - c[0][-1] = endpos - else: - # otherwise, add a new region - c[0].append(endpos) - c[1].append(value) - - if value > self.maxvalue: - self.maxvalue = value - if value < self.minvalue: - self.minvalue = value - - cpdef add_loc_wo_merge ( self, bytes chromosome, int32_t startpos, int32_t endpos, float32_t value): - """Add a chr-start-end-value block into __data dictionary. - - Note, we don't check if the add_loc is called continuously on - sorted regions without any gap. So we only suggest calling - this function within MACS. - - This one won't merge nearby ranges with the same value - """ - if endpos <= 0: - return - if startpos < 0: - startpos = 0 - - if value < self.baseline_value: - value = self.baseline_value - - if chromosome not in self.__data: - self.__data[chromosome] = [ pyarray('i',[]), pyarray('f',[]) ] - c = self.__data[chromosome] - if startpos: - # start pos is not 0, then add two blocks, the first - # with "baseline_value"; the second with "value" - c[0].append(startpos) - c[1].append(self.baseline_value) - c = self.__data[chromosome] - c[0].append(endpos) - c[1].append(value) - if value > self.maxvalue: - self.maxvalue = value - if value < self.minvalue: - self.minvalue = value - - cpdef add_chrom_data( self, bytes chromosome, object p, object v ): - """Add a pv data to a chromosome. Replace the previous data. - - p: a pyarray object 'i' for positions - v: a pyarray object 'f' for values - - Note: no checks for error, use with caution - """ - cdef: - float32_t maxv, minv - - self.__data[ chromosome ] = [ p, v ] - maxv = max( v ) - minv = min( v ) - if maxv > self.maxvalue: - self.maxvalue = maxv - if minv < self.minvalue: - self.minvalue = minv - return - - cpdef add_chrom_data_hmmr_PV( self, bytes chromosome, object pv ): - """Add a pv data to a chromosome. Replace the previous data. - - This is a kinda silly function to waste time and convert a PV - array (2-d named numpy array) into two python arrays for this - BedGraph class. May have better function later. - - Note: no checks for error, use with caution - """ - cdef: - float32_t maxv, minv - int32_t i - - self.__data[ chromosome ] = [ pyarray('i', pv['p']), pyarray('f',pv['v']) ] - minv = pv['v'].min() - maxv = pv['p'].max() - if maxv > self.maxvalue: - self.maxvalue = maxv - if minv < self.minvalue: - self.minvalue = minv - return - - cpdef bool destroy ( self ): - """ destroy content, free memory. - """ - cdef: - set chrs - bytes chrom - - chrs = self.get_chr_names() - for chrom in sorted(chrs): - if chrom in self.__data: - self.__data[chrom] = [None, None] - self.__data.pop(chrom) - return True - - cpdef list get_data_by_chr (self, bytes chromosome): - """Return array of counts by chromosome. 
- - The return value is a tuple: - ([end pos],[value]) - """ - if chromosome in self.__data: - return self.__data[chromosome] - else: - return [] - - cpdef set get_chr_names (self): - """Return all the chromosome names stored. - - """ - return set(sorted(self.__data.keys())) - - cpdef void reset_baseline (self, float32_t baseline_value): - """Reset baseline value to baseline_value. - - So any region between self.baseline_value and baseline_value - will be set to baseline_value. - - """ - self.baseline_value = baseline_value - self.filter_score(cutoff=baseline_value) - self.merge_regions() - return - - cdef merge_regions (self): - """Merge nearby regions with the same value. - - """ - cdef: - int32_t new_pre_pos, pos, i - float32_t new_pre_value, value - bytes chrom - set chrs - - chrs = self.get_chr_names() - for chrom in sorted(chrs): - (p,v) = self.__data[chrom] - pnext = iter(p).__next__ - vnext = iter(v).__next__ - - # new arrays - new_pos = pyarray('L',[pnext(),]) - new_value = pyarray('f',[vnext(),]) - - newpa = new_pos.append - newva = new_value.append - - new_pre_pos = new_pos[0] - new_pre_value = new_value[0] - - for i in range(1,len(p)): - pos = pnext() - value = vnext() - if value == new_pre_value: - new_pos[-1] = pos - else: - # add new region - newpa(pos) - newva(value) - new_pre_pos = pos - new_pre_value = value - self.__data[chrom] = [new_pos,new_value] - return True - - cpdef bool filter_score (self, float32_t cutoff=0): - """Filter using a score cutoff. Any region lower than score - cutoff will be set to self.baseline_value. - - Self will be modified. - """ - cdef: - int32_t new_pre_pos, pos, i - float32_t new_pre_value, value - bytes chrom - set chrs - - chrs = self.get_chr_names() - for chrom in sorted(chrs): - (p,v) = self.__data[chrom] - pnext = iter(p).__next__ - vnext = iter(v).__next__ - - # new arrays - new_pos = pyarray('L',[]) - new_value = pyarray('f',[]) - new_pre_pos = 0 - new_pre_value = 0 - - for i in range(len(p)): - pos = pnext() - value = vnext() - - if value < cutoff: - # this region will be set to baseline_value - if new_pre_value == self.baseline_value: - # if preceding region is at baseline, extend it - new_pos[-1] = pos - else: - # else add a new baseline region - new_pos.append(pos) - new_value.append(self.baseline_value) - else: - # put it into new arrays - new_pos.append(pos) - new_value.append(value) - new_pre_pos = new_pos[-1] - new_pre_value = new_value[-1] - self.__data[chrom]=[new_pos,new_value] - return True - - cpdef tuple summary (self): - """Calculate the sum, total_length, max, min, mean, and std. - - Return a tuple for (sum, total_length, max, min, mean, std). 
- """ - cdef: - int64_tn_v - float32_t sum_v, max_v, min_v, mean_v, variance, tmp, std_v - int32_t pre_p, l, i - - pre_p = 0 - n_v = 0 - sum_v = 0 - max_v = -100000 - min_v = 100000 - for (p,v) in self.__data.values(): - # for each chromosome - pre_p = 0 - for i in range(len(p)): - # for each region - l = p[i]-pre_p - sum_v += v[i]*l - n_v += l - pre_p = p[i] - max_v = max(max(v),max_v) - min_v = min(min(v),min_v) - mean_v = sum_v/n_v - variance = 0.0 - for (p,v) in self.__data.values(): - for i in range(len(p)): - # for each region - tmp = v[i]-mean_v - l = p[i]-pre_p - variance += tmp*tmp*l - pre_p = p[i] - - variance /= float(n_v-1) - std_v = sqrt(variance) - return (sum_v, n_v, max_v, min_v, mean_v, std_v) - - cpdef object call_peaks (self, float32_t cutoff=1, - int32_t min_length=200, int32_t max_gap=50, - bool call_summits=False): - """This function try to find regions within which, scores - are continuously higher than a given cutoff. - - This function is NOT using sliding-windows. Instead, any - regions in bedGraph above certain cutoff will be detected, - then merged if the gap between nearby two regions are below - max_gap. After this, peak is reported if its length is above - min_length. - - cutoff: cutoff of value, default 1. - min_length : minimum peak length, default 200. - gap : maximum gap to merge nearby peaks, default 50. - - Removed option: - - up_limit: the highest acceptable value. Default 10^{310} - * so only allow peak with value >=cutoff and <=up_limit - - This does not work. The region above upper limit may still be - included as `gap` . - - """ - cdef: - int32_t peak_length, x, pre_p, p, i, summit, tstart, tend - float32_t v, summit_value, tvalue - bytes chrom - set chrs - object peaks - - chrs = self.get_chr_names() - peaks = PeakIO() # dictionary to save peaks - for chrom in sorted(chrs): - peak_content = None - peak_length = 0 - (ps,vs) = self.get_data_by_chr(chrom) # arrays for position and values - psn = iter(ps).__next__ # assign the next function to a viable to speed up - vsn = iter(vs).__next__ - x = 0 - pre_p = 0 # remember previous position - while True: - # find the first region above cutoff - try: # try to read the first data range for this chrom - p = psn() - v = vsn() - except: - break - x += 1 # index for the next point - if v >= cutoff: - peak_content = [(pre_p,p,v),] - pre_p = p - break # found the first range above cutoff - else: - pre_p = p - - for i in range(x,len(ps)): - # continue scan the rest regions - p = psn() - v = vsn() - if v < cutoff: # not be detected as 'peak' - pre_p = p - continue - # for points above cutoff - # if the gap is allowed - if pre_p - peak_content[-1][1] <= max_gap: - peak_content.append((pre_p,p,v)) - else: - # when the gap is not allowed, close this peak - self.__close_peak(peak_content, peaks, min_length, chrom) #, smoothlen=max_gap / 2 ) - # start a new peak - peak_content = [(pre_p,p,v),] - pre_p = p - - # save the last peak - if not peak_content: - continue - self.__close_peak(peak_content, peaks, min_length, chrom) #, smoothlen=max_gap / 2 ) - return peaks - - cdef bool __close_peak( self, list peak_content, object peaks, int32_t min_length, bytes chrom ): - cdef: - list tsummit # list for temporary summits - int32_t peak_length, summit, tstart, tend - float32_t summit_value, tvalue - - peak_length = peak_content[-1][1]-peak_content[0][0] - if peak_length >= min_length: # if the peak is too small, reject it - tsummit = [] - summit = 0 - summit_value = 0 - for (tstart,tend,tvalue) in peak_content: - if not 
summit_value or summit_value < tvalue: - tsummit = [((tend+tstart)/2),] - summit_value = tvalue - elif summit_value == tvalue: - tsummit.append( ((tend+tstart)/2) ) - summit = tsummit[((len(tsummit)+1)/2)-1 ] - peaks.add( chrom, - peak_content[0][0], - peak_content[-1][1], - summit = summit, - peak_score = summit_value, - pileup = 0, - pscore = 0, - fold_change = 0, - qscore = 0 - ) - return True - - cpdef object call_broadpeaks (self, float32_t lvl1_cutoff=500, float32_t lvl2_cutoff=100, - int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400): - """This function try to find enriched regions within which, - scores are continuously higher than a given cutoff for level - 1, and link them using the gap above level 2 cutoff with a - maximum length of lvl2_max_gap. - - lvl1_cutoff: cutoff of value at enriched regions, default 500. - lvl2_cutoff: cutoff of value at linkage regions, default 100. - min_length : minimum peak length, default 200. - lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. - lvl2_max_gap : maximum length of linkage regions, default 400. - colname: can be 'sample','control','-100logp','-100logq'. Cutoff will be applied to the specified column. - - Return both general PeakIO object for highly enriched regions - and gapped broad regions in BroadPeakIO. - """ - cdef: - bytes chrom - int32_t i, j - set chrs - object lvl1, lvl2 # PeakContent class object - list temppeakset, lvl1peakschrom, lvl2peakschrom - - assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2." - assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1." - lvl1_peaks = self.call_peaks( cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap, call_summits=False ) - lvl2_peaks = self.call_peaks( cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap, call_summits=False ) - chrs = lvl1_peaks.get_chr_names() - broadpeaks = BroadPeakIO() - # use lvl2_peaks as linking regions between lvl1_peaks - for chrom in sorted(chrs): - lvl1peakschrom = lvl1_peaks.get_data_from_chrom(chrom) - lvl2peakschrom = lvl2_peaks.get_data_from_chrom(chrom) - lvl1peakschrom_next = iter(lvl1peakschrom).__next__ - tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region - # our assumption is lvl1 regions should be included in lvl2 regions - try: - lvl1 = lvl1peakschrom_next() - for i in range( len(lvl2peakschrom) ): - # for each lvl2 peak, find all lvl1 peaks inside - lvl2 = lvl2peakschrom[i] - while True: - if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: - tmppeakset.append(lvl1) - lvl1 = lvl1peakschrom_next() - else: - self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) - tmppeakset = [] - break - except StopIteration: - self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) - tmppeakset = [] - for j in range( i+1, len(lvl2peakschrom) ): - self.__add_broadpeak ( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset) - return broadpeaks - - cdef object __add_broadpeak (self, object bpeaks, bytes chrom, object lvl2peak, list lvl1peakset): - """Internal function to create broad peak. - - """ - cdef: - int32_t start, end, blockNum - bytes blockSizes, blockStarts, thickStart, thickEnd - - start = lvl2peak["start"] - end = lvl2peak["end"] - - # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside - if not lvl1peakset: - # try: - # will complement by adding 1bps start and end to this region - # may change in the future if gappedPeak format was improved. 
- bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end), - blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) - return bpeaks - - thickStart = b"%d" % lvl1peakset[0]["start"] - thickEnd = b"%d" % lvl1peakset[-1]["end"] - blockNum = len(lvl1peakset) - blockSizes = b",".join( [b"%d" % x["length"] for x in lvl1peakset] ) - blockStarts = b",".join( [b"%d" % (x["start"]-start) for x in lvl1peakset] ) - - if int(thickStart) != start: - # add 1bp left block - thickStart = b"%d" % start - blockNum += 1 - blockSizes = b"1,"+blockSizes - blockStarts = b"0,"+blockStarts - if int(thickEnd) != end: - # add 1bp right block - thickEnd = b"%d" % end - blockNum += 1 - blockSizes = blockSizes+b",1" - blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) - - bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, - blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) - return bpeaks - - cpdef object refine_peaks (self, object peaks): - """This function try to based on given peaks, re-evaluate the - peak region, call the summit. - - peaks: PeakIO object - - return: a new PeakIO object - - """ - cdef: - int32_t peak_length, x, pre_p, p, i, peak_s, peak_e - float32_t v - bytes chrom - set chrs - object new_peaks - - peaks.sort() - new_peaks = PeakIO() - chrs = self.get_chr_names() - assert isinstance(peaks, PeakIO) - chrs = chrs.intersection(set(peaks.get_chr_names())) - - for chrom in sorted(chrs): - peaks_chr = peaks.get_data_from_chrom(chrom) - peak_content = [] - (ps,vs) = self.get_data_by_chr(chrom) # arrays for position and values - psn = iter(ps).__next__ # assign the next function to a viable to speed up - vsn = iter(vs).__next__ - peakn = iter(peaks_chr).__next__ - - pre_p = 0 # remember previous position in bedgraph/self - p = psn() - v = vsn() - peak = peakn() - peak_s = peak["start"] - peak_e = peak["end"] - - while True: - # look for overlap - if p > peak_s and peak_e > pre_p: - # now put four coordinates together and pick the middle two - s, e = sorted([p, peak_s, peak_e, pre_p])[1:3] - # add this content - peak_content.append( (s, e, v) ) - # move self/bedGraph - try: - pre_p = p - p = psn() - v = vsn() - except: - # no more value chunk in bedGraph - break - elif pre_p >= peak_e: - # close peak - self.__close_peak(peak_content, new_peaks, 0, chrom) - peak_content = [] - # move peak - try: - peak = peakn() - peak_s = peak["start"] - peak_e = peak["end"] - except: - # no more peak - break - elif peak_s >= p: - # move self/bedgraph - try: - pre_p = p - p = psn() - v = vsn() - except: - # no more value chunk in bedGraph - break - else: - raise Exception(f"no way here! prev position:{pre_p}; position:{p}; value:{v}; peak start:{peak_s}; peak end:{peak_e}") - - # save the last peak - if peak_content: - self.__close_peak(peak_content, new_peaks, 0, chrom) - return new_peaks - - - cpdef int32_t total (self): - """Return the number of regions in this object. - - """ - cdef: - int32_t t - t = 0 - for ( p, v ) in self.__data.values(): - t += len(p) - return t - - cpdef object set_single_value (self, float32_t new_value): - """Change all the values in bedGraph to the same new_value, - return a new bedGraphTrackI. 
- - """ - cdef: - bytes chrom - int32_t max_p - object ret - - ret = bedGraphTrackI() - chroms = set(self.get_chr_names()) - for chrom in sorted(chroms): - (p1,v1) = self.get_data_by_chr(chrom) # arrays for position and values - # maximum p - max_p = max(p1) - # add a region from 0 to max_p - ret.add_loc(chrom,0,max_p,new_value) - return ret - - cpdef object overlie (self, object bdgTracks, str func="max" ): - """Calculate two or more bedGraphTrackI objects by letting self - overlying bdgTrack2, with user-defined functions. - - Transition positions from both bedGraphTrackI objects will be - considered and combined. For example: - - #1 bedGraph (self) | #2 bedGraph - ----------------------------------------------- - chr1 0 100 0 | chr1 0 150 1 - chr1 100 200 3 | chr1 150 250 2 - chr1 200 300 4 | chr1 250 300 4 - - these two bedGraphs will be combined to have five transition - points: 100, 150, 200, 250, and 300. So in order to calculate - two bedGraphs, I pair values within the following regions - like: - - chr s e (#1,#2) applied_func_max - ----------------------------------------------- - chr1 0 100 (0,1) 1 - chr1 100 150 (3,1) 3 - chr1 150 200 (3,2) 3 - chr1 200 250 (4,2) 4 - chr1 250 300 (4,4) 4 - - Then the given 'func' will be applied on each 2-tuple as func(#1,#2) - - Supported 'func' are "sum", "subtract" (only for two bdg - objects), "product", "divide" (only for two bdg objects), - "max", "mean" and "fisher". - - Return value is a new bedGraphTrackI object. - - Option: bdgTracks can be a list of bedGraphTrackI objects - """ - cdef: - int32_t pre_p, p1, p2 - float32_t v1, v2 - bytes chrom - - nr_tracks = len(bdgTracks) + 1 # +1 for self - assert nr_tracks >= 2, "Specify at least one more bdg objects." - for i, bdgTrack in enumerate(bdgTracks): - assert isinstance(bdgTrack, bedGraphTrackI), "bdgTrack{} is not a bedGraphTrackI object".format(i + 1) - - if func == "max": - f = max - elif func == "mean": - f = mean_func - elif func == "fisher": - f = fisher_func - elif func == "sum": - f = sum - elif func == "product": - f = product_func - elif func == "subtract": - if nr_tracks == 2: - f = subtract_func - else: - raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") - elif func == "divide": - if nr_tracks == 2: - f = divide_func - else: - raise Exception(f"Only one more bdg object is allowed, but provided {nr_tracks-1}") - else: - raise Exception("Invalid function {func}! Choose from 'sum', 'subtract' (only for two bdg objects), 'product', 'divide' (only for two bdg objects), 'max', 'mean' and 'fisher'. 
") - - ret = bedGraphTrackI() - retadd = ret.add_loc - - common_chr = set(self.get_chr_names()) - for track in bdgTracks: - common_chr = common_chr.intersection(set(track.get_chr_names())) - - for chrom in sorted(common_chr): - datas = [self.get_data_by_chr(chrom)] - datas.extend([bdgTracks[i].get_data_by_chr(chrom) for i in range(len(bdgTracks))]) - - ps, vs, pn, vn = [], [], [], [] - for data in datas: - ps.append(data[0]) - pn.append(iter(ps[-1]).__next__) - vs.append(data[1]) - vn.append(iter(vs[-1]).__next__) - - pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret - try: - ps_cur = [pn[i]() for i in range(len(pn))] - vs_cur = [vn[i]() for i in range(len(pn))] - - while True: - # get the lowest position - lowest_p = min(ps_cur) - - # at least one lowest position, could be multiple - locations = [i for i in range(len(ps_cur)) if ps_cur[i] == lowest_p] - - # add the data until the interval - ret.add_loc(chrom, pre_p, ps_cur[locations[0]], f(vs_cur)) - - pre_p = ps_cur[locations[0]] - for index in locations: - ps_cur[index] = pn[index]() - vs_cur[index] = vn[index]() - except StopIteration: - # meet the end of either bedGraphTrackI, simply exit - pass - return ret - - cpdef bool apply_func ( self, func ): - """Apply function 'func' to every value in this bedGraphTrackI object. - - *Two adjacent regions with same value after applying func will - not be merged. - """ - cdef int32_t i - - for (p,s) in self.__data.values(): - for i in range(len(s)): - s[i] = func(s[i]) - self.maxvalue = func(self.maxvalue) - self.minvalue = func(self.minvalue) - return True - - cpdef p2q ( self ): - """Convert pvalue scores to qvalue scores. - - *Assume scores in this bedGraph are pvalue scores! Not work - for other type of scores. - """ - cdef: - bytes chrom - object pos_array, pscore_array - dict pvalue_stat = {} - dict pqtable = {} - int64_t n, pre_p, this_p, length, j, pre_l, l, i - float32_t this_v, pre_v, v, q, pre_q, this_t, this_c - int64_t N, k, this_l - float32_t f - int64_t nhcal = 0 - int64_t npcal = 0 - list unique_values - float32_t t0, t1, t - - # calculate frequencies of each p-score - for chrom in sorted(self.get_chr_names()): - pre_p = 0 - - [pos_array, pscore_array] = self.__data[ chrom ] - - pn = iter(pos_array).__next__ - vn = iter(pscore_array).__next__ - - for i in range( len( pos_array ) ): - this_p = pn() - this_v = vn() - this_l = this_p - pre_p - if this_v in pvalue_stat: - pvalue_stat[ this_v ] += this_l - else: - pvalue_stat[ this_v ] = this_l - pre_p = this_p - - nhcal += len( pos_array ) - - nhval = 0 - - N = sum(pvalue_stat.values()) # total length - k = 1 # rank - f = -log10(N) - pre_v = -2147483647 - pre_l = 0 - pre_q = 2147483647 # save the previous q-value - - # calculate qscore for each pscore - pqtable = {} - unique_values = sorted(pvalue_stat.keys(), reverse=True) - for i in range(len(unique_values)): - v = unique_values[i] - l = pvalue_stat[v] - q = v + (log10(k) + f) - q = max(0,min(pre_q,q)) # make q-score monotonic - pqtable[ v ] = q - pre_v = v - pre_q = q - k+=l - nhcal += 1 - - # convert pscore to qscore - for chrom in sorted(self.get_chr_names()): - [pos_array, pscore_array] = self.__data[ chrom ] - - for i in range( len( pos_array ) ): - pscore_array[ i ] = pqtable[ pscore_array[ i ] ] - - self.merge_regions() - return - - - cpdef object extract_value ( self, object bdgTrack2 ): - """Extract values from regions defined in bedGraphTrackI class object - `bdgTrack2`. 
- - """ - cdef: - int32_t pre_p, p1, p2, i - float32_t v1, v2 - bytes chrom - object ret - - assert isinstance(bdgTrack2,bedGraphTrackI), "not a bedGraphTrackI object" - - ret = [ [], pyarray('f',[]), pyarray('L',[]) ] # 1: region in bdgTrack2; 2: value; 3: length with the value - radd = ret[0].append - vadd = ret[1].append - ladd = ret[2].append - - chr1 = set(self.get_chr_names()) - chr2 = set(bdgTrack2.get_chr_names()) - common_chr = chr1.intersection(chr2) - for i in range( len( common_chr ) ): - chrom = common_chr.pop() - (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values - p1n = iter(p1s).__next__ # assign the next function to a viable to speed up - v1n = iter(v1s).__next__ - - (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values - p2n = iter(p2s).__next__ # assign the next function to a viable to speed up - v2n = iter(v2s).__next__ - pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret - try: - p1 = p1n() - v1 = v1n() - - p2 = p2n() - v2 = v2n() - - while True: - if p1 < p2: - # clip a region from pre_p to p1, then set pre_p as p1. - if v2>0: - radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) - vadd(v1) - ladd(p1-pre_p) - pre_p = p1 - # call for the next p1 and v1 - p1 = p1n() - v1 = v1n() - elif p2 < p1: - # clip a region from pre_p to p2, then set pre_p as p2. - if v2>0: - radd(str(chrom)+"."+str(pre_p)+"."+str(p2)) - vadd(v1) - ladd(p2-pre_p) - pre_p = p2 - # call for the next p2 and v2 - p2 = p2n() - v2 = v2n() - elif p1 == p2: - # from pre_p to p1 or p2, then set pre_p as p1 or p2. - if v2>0: - radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) - vadd(v1) - ladd(p1-pre_p) - pre_p = p1 - # call for the next p1, v1, p2, v2. - p1 = p1n() - v1 = v1n() - p2 = p2n() - v2 = v2n() - except StopIteration: - # meet the end of either bedGraphTrackI, simply exit - pass - - return ret - - cpdef object extract_value_hmmr ( self, object bdgTrack2 ): - """Extract values from regions defined in bedGraphTrackI class object - `bdgTrack2`. - - I will try to tweak this function to output only the values of - bdgTrack1 (self) in the regions in bdgTrack2 - - This is specifically for HMMRATAC. bdgTrack2 should be a - bedgraph object containing the bins with value set to - 'mark_bin' -- the bins in the same region will have the same - value. - """ - cdef: - int32_t pre_p, p1, p2, i - float32_t v1, v2 - bytes chrom - list ret - - assert isinstance(bdgTrack2,bedGraphTrackI), "not a bedGraphTrackI object" - - ret = [ [], pyarray('f',[]), pyarray('i',[]) ] # 0: bin location (chrom, position); 1: value; 2: number of bins in this region - padd = ret[0].append - vadd = ret[1].append - ladd = ret[2].append - - chr1 = set(self.get_chr_names()) - chr2 = set(bdgTrack2.get_chr_names()) - common_chr = sorted(list(chr1.intersection(chr2))) - for i in range( len( common_chr ) ): - chrom = common_chr.pop() - (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values - p1n = iter(p1s).__next__ # assign the next function to a viable to speed up - v1n = iter(v1s).__next__ - - (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values - p2n = iter(p2s).__next__ # assign the next function to a viable to speed up - v2n = iter(v2s).__next__ - pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret - try: - p1 = p1n() - v1 = v1n() - - p2 = p2n() - v2 = v2n() - - while True: - if p1 < p2: - # clip a region from pre_p to p1, then set pre_p as p1. 
- # in this case, we don't output any - #if v2>0: - # radd(str(chrom)+"."+str(pre_p)+"."+str(p1)) - # vadd(v1) - # ladd(p1-pre_p) - pre_p = p1 - # call for the next p1 and v1 - p1 = p1n() - v1 = v1n() - elif p2 < p1: - # clip a region from pre_p to p2, then set pre_p as p2. - if v2 != 0: #0 means it's a gap region, we should have value > 1 - padd( (chrom, p2) ) - vadd(v1) - ladd(int(v2)) - pre_p = p2 - # call for the next p2 and v2 - p2 = p2n() - v2 = v2n() - elif p1 == p2: - # from pre_p to p1 or p2, then set pre_p as p1 or p2. - if v2 != 0: #0 means it's a gap region, we should have 1 or -1 - padd( (chrom, p2) ) - vadd(v1) - ladd(int(v2)) - pre_p = p1 - # call for the next p1, v1, p2, v2. - p1 = p1n() - v1 = v1n() - p2 = p2n() - v2 = v2n() - except StopIteration: - # meet the end of either bedGraphTrackI, simply exit - pass - - return ret - - cpdef make_ScoreTrackII_for_macs (self, object bdgTrack2, float32_t depth1 = 1.0, float32_t depth2 = 1.0 ): - """A modified overlie function for MACS v2. - - effective_depth_in_million: sequencing depth in million after - duplicates being filtered. If - treatment is scaled down to - control sample size, then this - should be control sample size in - million. And vice versa. - - Return value is a ScoreTrackII object. - """ - cdef: - int32_t pre_p, p1, p2 - float32_t v1, v2 - bytes chrom - object ret - - assert isinstance(bdgTrack2,bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object" - - ret = ScoreTrackII( treat_depth = depth1, ctrl_depth = depth2 ) - retadd = ret.add - - chr1 = set(self.get_chr_names()) - chr2 = set(bdgTrack2.get_chr_names()) - common_chr = chr1.intersection(chr2) - for chrom in sorted(common_chr): - - (p1s,v1s) = self.get_data_by_chr(chrom) # arrays for position and values - p1n = iter(p1s).__next__ # assign the next function to a viable to speed up - v1n = iter(v1s).__next__ - - (p2s,v2s) = bdgTrack2.get_data_by_chr(chrom) # arrays for position and values - p2n = iter(p2s).__next__ # assign the next function to a viable to speed up - v2n = iter(v2s).__next__ - - chrom_max_len = len(p1s)+len(p2s) # this is the maximum number of locations needed to be recorded in scoreTrackI for this chromosome. - - ret.add_chromosome(chrom,chrom_max_len) - - pre_p = 0 # remember the previous position in the new bedGraphTrackI object ret - - try: - p1 = p1n() - v1 = v1n() - - p2 = p2n() - v2 = v2n() - - while True: - if p1 < p2: - # clip a region from pre_p to p1, then set pre_p as p1. - retadd( chrom, p1, v1, v2 ) - pre_p = p1 - # call for the next p1 and v1 - p1 = p1n() - v1 = v1n() - elif p2 < p1: - # clip a region from pre_p to p2, then set pre_p as p2. - retadd( chrom, p2, v1, v2 ) - pre_p = p2 - # call for the next p2 and v2 - p2 = p2n() - v2 = v2n() - elif p1 == p2: - # from pre_p to p1 or p2, then set pre_p as p1 or p2. - retadd( chrom, p1, v1, v2 ) - pre_p = p1 - # call for the next p1, v1, p2, v2. - p1 = p1n() - v1 = v1n() - p2 = p2n() - v2 = v2n() - except StopIteration: - # meet the end of either bedGraphTrackI, simply exit - pass - - ret.finalize() - #ret.merge_regions() - return ret - - cpdef str cutoff_analysis ( self, int32_t max_gap, int32_t min_length, int32_t steps = 100, float32_t min_score = 0, float32_t max_score = 1000 ): - """ - Cutoff analysis function for bedGraphTrackI object. - - This function will try all possible cutoff values on the score - column to call peaks. Then will give a report of a number of - metrics (number of peaks, total length of peaks, average - length of peak) at varying score cutoffs. 
For each score - cutoff, the function finds the positions where the score - exceeds the cutoff, then groups those positions into "peaks" - based on the maximum allowed gap (max_gap) between consecutive - positions. If a peak's length exceeds the minimum length - (min_length), the peak is counted. - - Parameters - ---------- - - max_gap : int32_t - Maximum allowed gap between consecutive positions above cutoff - - min_length : int32_t Minimum length of peak - steps: int32_t - It will be used to calculate 'step' to increase from min_v to - max_v (see below). - - min_score: float32_t - Minimum score for cutoff analysis. Note1: we will take the - larger value between the actual minimum value in the BedGraph - and min_score as min_v. Note2: the min_v won't be included in - the final result. We will try to output the smallest cutoff as - min_v+step. - - max_score: float32_t - Maximum score for cutoff analysis. Note1: we will take the - smaller value between the actual maximum value in the BedGraph - and max_score as max_v. Note2: the max_v may not be included - in the final result. We will only output the cutoff that can - generate at least 1 peak. - - Returns - ------- - - Cutoff analysis report in str object. - - Todos - ----- - - May need to separate this function out as a class so that we - can add more ways to analyze the result. Also, we can let this - function return a list of dictionary or data.frame in that - way, instead of str object. - - """ - cdef: - set chrs - list peak_content, ret_list, cutoff_list, cutoff_npeaks, cutoff_lpeaks - bytes chrom - str ret - float32_t cutoff - int64_t total_l, total_p, i, n, ts, te, lastp, tl, peak_length - #dict cutoff_npeaks, cutoff_lpeaks - float32_t s, midvalue - - chrs = self.get_chr_names() - - #midvalue = self.minvalue/2 + self.maxvalue/2 - #s = float(self.minvalue - midvalue)/steps - minv = max( min_score, self.minvalue ) - maxv = min( self.maxvalue, max_score ) - - s = float(maxv - minv)/steps - - # a list of possible cutoff values from minv to maxv with step of s - cutoff_list = [round(value, 3) for value in np.arange(minv, maxv, s)] - - cutoff_npeaks = [0] * len( cutoff_list ) - cutoff_lpeaks = [0] * len( cutoff_list ) - - for chrom in sorted(chrs): - ( pos_array, score_array ) = self.__data[ chrom ] - pos_array = np.array( self.__data[ chrom ][ 0 ] ) - score_array = np.array( self.__data[ chrom ][ 1 ] ) - - for n in range( len( cutoff_list ) ): - cutoff = cutoff_list[ n ] - total_l = 0 # total length of peaks - total_p = 0 # total number of peaks - - # get the regions with scores above cutoffs - above_cutoff = np.nonzero( score_array > cutoff )[0]# this is not an optimized method. It would be better to store score array in a 2-D ndarray? 
- above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff - above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff - - if above_cutoff_endpos.size == 0: - continue - - # first bit of region above cutoff - acs_next = iter(above_cutoff_startpos).__next__ - ace_next = iter(above_cutoff_endpos).__next__ - - ts = acs_next() - te = ace_next() - peak_content = [( ts, te ), ] - lastp = te - - for i in range( 1, above_cutoff_startpos.size ): - ts = acs_next() - te = ace_next() - tl = ts - lastp - if tl <= max_gap: - peak_content.append( ( ts, te ) ) - else: - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - total_l += peak_length - total_p += 1 - peak_content = [ ( ts, te ), ] - lastp = te - - if peak_content: - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - total_l += peak_length - total_p += 1 - cutoff_lpeaks[ n ] += total_l - cutoff_npeaks[ n ] += total_p - - # prepare the returnning text - ret_list = ["score\tnpeaks\tlpeaks\tavelpeak\n"] - for n in range( len( cutoff_list )-1, -1, -1 ): - cutoff = cutoff_list[ n ] - if cutoff_npeaks[ n ] > 0: - ret_list.append("%.2f\t%d\t%d\t%.2f\n" % ( cutoff, cutoff_npeaks[ n ], \ - cutoff_lpeaks[ n ], \ - cutoff_lpeaks[ n ]/cutoff_npeaks[ n ] )) - ret = ''.join(ret_list) - return ret - -cdef np.ndarray calculate_elbows( np.ndarray values, float32_t threshold=0.01): - # although this function is supposed to find elbow pts for cutoff analysis, - # however, in reality, it barely works... - cdef: - np.ndarray deltas, slopes, delta_slopes, elbows - np.float32_t avg_delta_slope - - # Calculate the difference between each point and the first point - deltas = values - values[0] - - # Calculate the slope between each point and the last point - slopes = deltas / (values[-1] - values[0]) - - # Calculate the change in slope - delta_slopes = np.diff(slopes) - - # Calculate the average change in slope - avg_delta_slope = np.mean(delta_slopes) - - # Find all points where the change in slope is significantly larger than the average - elbows = np.where(delta_slopes > avg_delta_slope + threshold)[0] - - return elbows diff --git a/MACS3/Signal/CallPeakUnit.py b/MACS3/Signal/CallPeakUnit.py new file mode 100644 index 00000000..2e99613b --- /dev/null +++ b/MACS3/Signal/CallPeakUnit.py @@ -0,0 +1,2256 @@ +# cython: language_level=3 +# cython: profile=True +# cython: linetrace=True +# Time-stamp: <2024-10-22 11:42:37 Tao Liu> + +"""Module for Calculate Scores. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). 
+""" + +# ------------------------------------ +# python modules +# ------------------------------------ + +import _pickle as cPickle +from tempfile import mkstemp +import os + +# ------------------------------------ +# Other modules +# ------------------------------------ +import numpy as np +import cython +import cython.cimports.numpy as cnp +# from numpy cimport int32_t, int64_t, float32_t, float64_t +from cython.cimports.cpython import bool +from cykhash import PyObjectMap, Float32to32Map + +# ------------------------------------ +# C lib +# ------------------------------------ +from cython.cimports.libc.stdio import FILE, fopen, fprintf, fclose +from cython.cimports.libc.math import exp, log10, log1p, erf, sqrt + +# ------------------------------------ +# MACS3 modules +# ------------------------------------ +from MACS3.Signal.SignalProcessing import maxima, enforce_peakyness +from MACS3.IO.PeakIO import PeakIO, BroadPeakIO +from MACS3.Signal.FixWidthTrack import FWTrack +from MACS3.Signal.PairedEndTrack import PETrackI +from MACS3.Signal.Prob import poisson_cdf +from MACS3.Utilities.Logger import logging + +logger = logging.getLogger(__name__) +debug = logger.debug +info = logger.info +# -------------------------------------------- +# cached pscore function and LR_asym functions +# -------------------------------------------- +pscore_dict = PyObjectMap() +logLR_dict = PyObjectMap() + + +@cython.cfunc +def get_pscore(t: tuple) -> cython.float: + """t: tuple of (lambda, observation) + """ + val: cython.float + + if t in pscore_dict: + return pscore_dict[t] + else: + # calculate and cache + val = -1.0 * poisson_cdf(t[0], t[1], False, True) + pscore_dict[t] = val + return val + + +@cython.cfunc +def get_logLR_asym(t: tuple) -> cython.float: + """Calculate log10 Likelihood between H1 (enriched) and H0 ( + chromatin bias). Set minus sign for depletion. 
+ """ + val: cython.float + x: cython.float + y: cython.float + + if t in logLR_dict: + return logLR_dict[t] + else: + x = t[0] + y = t[1] + # calculate and cache + if x > y: + val = (x*(log10(x)-log10(y))+y-x) + elif x < y: + val = (x*(-log10(x)+log10(y))-y+x) + else: + val = 0 + logLR_dict[t] = val + return val + +# ------------------------------------ +# constants +# ------------------------------------ + + +LOG10_E: cython.float = 0.43429448190325176 + +# ------------------------------------ +# Misc functions +# ------------------------------------ + + +@cython.cfunc +def clean_up_ndarray(x: cnp.ndarray): + # clean numpy ndarray in two steps + i: cython.long + + i = x.shape[0] // 2 + x.resize(100000 if i > 100000 else i, refcheck=False) + x.resize(0, refcheck=False) + return + + +@cython.cfunc +@cython.inline +def chi2_k1_cdf(x: cython.float) -> cython.float: + return erf(sqrt(x/2)) + + +@cython.cfunc +@cython.inline +def log10_chi2_k1_cdf(x: cython.float) -> cython.float: + return log10(erf(sqrt(x/2))) + + +@cython.cfunc +@cython.inline +def chi2_k2_cdf(x: cython.float) -> cython.float: + return 1 - exp(-x/2) + + +@cython.cfunc +@cython.inline +def log10_chi2_k2_cdf(x: cython.float) -> cython.float: + return log1p(- exp(-x/2)) * LOG10_E + + +@cython.cfunc +@cython.inline +def chi2_k4_cdf(x: cython.float) -> cython.float: + return 1 - exp(-x/2) * (1 + x/2) + + +@cython.cfunc +@cython.inline +def log10_chi2_k4_CDF(x: cython.float) -> cython.float: + return log1p(- exp(-x/2) * (1 + x/2)) * LOG10_E + + +@cython.cfunc +@cython.inline +def apply_multiple_cutoffs(multiple_score_arrays: list, + multiple_cutoffs: list) -> cnp.ndarray: + i: cython.int + ret: cnp.ndarray + + ret = multiple_score_arrays[0] > multiple_cutoffs[0] + + for i in range(1, len(multiple_score_arrays)): + ret += multiple_score_arrays[i] > multiple_cutoffs[i] + + return ret + + +@cython.cfunc +@cython.inline +def get_from_multiple_scores(multiple_score_arrays: list, + index: cython.int) -> list: + ret: list = [] + i: cython.int + + for i in range(len(multiple_score_arrays)): + ret.append(multiple_score_arrays[i][index]) + return ret + + +@cython.cfunc +@cython.inline +def get_logFE(x: cython.float, + y: cython.float) -> cython.float: + """ return 100* log10 fold enrichment with +1 pseudocount. + """ + return log10(x/y) + + +@cython.cfunc +@cython.inline +def get_subtraction(x: cython.float, + y: cython.float) -> cython.float: + """ return subtraction. 
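One detail of `apply_multiple_cutoffs` defined above is worth spelling out: `+=` on NumPy boolean arrays is a logical OR, so the returned mask is true wherever at least one score track exceeds its cutoff. A tiny standalone demonstration (score values and cutoffs are hypothetical):

    import numpy as np

    pscore = np.array([6.0, 1.2, 3.0, 0.5], dtype="f4")
    qscore = np.array([2.0, 0.1, 4.0, 0.2], dtype="f4")

    mask = pscore > 5.0            # boolean ndarray
    mask += qscore > 3.0           # bool += bool acts as a logical OR
    print(np.nonzero(mask)[0])     # [0 2]: regions passing either cutoff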
+    """
+    return x - y
+
+
+@cython.cfunc
+@cython.inline
+def getitem_then_subtract(peakset: list,
+                          start: cython.int) -> list:
+    a: list
+
+    a = [x["start"] for x in peakset]
+    for i in range(len(a)):
+        a[i] = a[i] - start
+    return a
+
+
+@cython.cfunc
+@cython.inline
+def left_sum(data, pos: cython.int,
+             width: cython.int) -> cython.int:
+    """Sum of the values in data (a position -> value mapping) within
+    the window [pos - width, pos].
+    """
+    return sum([data[x] for x in data if x <= pos and x >= pos - width])
+
+
+@cython.cfunc
+@cython.inline
+def right_sum(data,
+              pos: cython.int,
+              width: cython.int) -> cython.int:
+    """Sum of the values in data (a position -> value mapping) within
+    the window [pos, pos + width].
+    """
+    return sum([data[x] for x in data if x >= pos and x <= pos + width])
+
+
+@cython.cfunc
+@cython.inline
+def left_forward(data,
+                 pos: cython.int,
+                 window_size: cython.int) -> cython.int:
+    return data.get(pos, 0) - data.get(pos-window_size, 0)
+
+
+@cython.cfunc
+@cython.inline
+def right_forward(data,
+                  pos: cython.int,
+                  window_size: cython.int) -> cython.int:
+    return data.get(pos + window_size, 0) - data.get(pos, 0)
+
+
+@cython.cfunc
+def median_from_value_length(value: cnp.ndarray(cython.float, ndim=1),
+                             length: list) -> cython.float:
+    """Return the median from a list of values and a list of
+    corresponding lengths: the value at which half of the total
+    length is covered.
+    """
+    tmp: list
+    c: cython.int
+    tmp_l: cython.int
+    tmp_v: cython.float
+    mid_l: cython.float
+
+    c = 0
+    tmp = sorted(list(zip(value, length)))
+    mid_l = sum(length)/2
+    for (tmp_v, tmp_l) in tmp:
+        c += tmp_l
+        if c > mid_l:
+            return tmp_v
+
+
+@cython.cfunc
+def mean_from_value_length(value: cnp.ndarray(cython.float, ndim=1),
+                           length: list) -> cython.float:
+    """Take a list of values and a list of corresponding lengths,
+    then calculate the length-weighted mean. An important function
+    for bedGraph type of data.
+
+    """
+    i: cython.int
+    tmp_l: cython.int
+    ln: cython.int
+    tmp_v: cython.double
+    sum_v: cython.double
+    tmp_sum: cython.double
+    ret: cython.float
+
+    sum_v = 0
+    ln = 0
+
+    for i in range(len(length)):
+        tmp_l = length[i]
+        tmp_v = cython.cast(cython.double, value[i])
+        tmp_sum = tmp_v * tmp_l
+        sum_v = tmp_sum + sum_v
+        ln += tmp_l
+
+    ret = cython.cast(cython.float, (sum_v/ln))
+
+    return ret
+
+
+@cython.cfunc
+def find_optimal_cutoff(x: list, y: list) -> tuple:
+    """Return the best cutoff (x, y) pair.
+
+    We assume that the total peak length increases exponentially
+    while the cutoff value decreases. But once the cutoff drops to a
+    point where background noise is captured, the total length
+    increases much faster. So we fit a linear model to the first 10
+    points, then look for the largest cutoff that still fits the
+    linear trend.
+
+    *Currently, this is a stub: the fit is computed but the function
+    always returns (1.0, 1.0).
+    """
+    npx: cnp.ndarray
+    npy: cnp.ndarray
+    npA: cnp.ndarray
+    ln: cython.long
+    i: cython.long
+    m: cython.float
+    c: cython.float    # slope and intercept
+    sst: cython.float  # sum of squares, total
+    sse: cython.float  # sum of squares, error
+    rsq: cython.float  # R-squared
+
+    ln = len(x)
+    assert ln == len(y)
+    npx = np.array(x)
+    npy = np.log10(np.array(y))
+    npA = np.vstack([npx, np.ones(len(npx))]).T
+
+    for i in range(10, ln):
+        # at least the largest 10 points
+        m, c = np.linalg.lstsq(npA[:i], npy[:i], rcond=None)[0]
+        sst = sum((npy[:i] - np.mean(npy[:i])) ** 2)
+        sse = sum((npy[:i] - m*npx[:i] - c) ** 2)
+        rsq = 1 - sse/sst
+        # print i, x[i], y[i], m, c, rsq
+    return (1.0, 1.0)
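`median_from_value_length` and `mean_from_value_length` above treat the data as (value, length) runs, which is how bedGraph-style tracks must be summarized: each value counts in proportion to how many basepairs it covers. A standalone check with hypothetical runs:

    import numpy as np

    values = np.array([2.0, 10.0], dtype="f4")  # pileup values of two runs
    lengths = [900, 100]                        # run lengths in bp

    # naive mean: (2 + 10) / 2 = 6.0, which overweights the short run
    # length-weighted mean: (2*900 + 10*100) / 1000 = 2.8
    print(float(np.dot(values, lengths)) / sum(lengths))  # 2.8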
+
+
+# ------------------------------------
+# Classes
+# ------------------------------------
+@cython.cclass
+class CallerFromAlignments:
+    """A unit to calculate scores and call peaks from alignments --
+    FWTrack or PETrackI objects.
+
+    It will compute for each chromosome separately in order to save
+    memory usage.
+    """
+    treat: object  # FWTrack or PETrackI object for ChIP
+    ctrl: object   # FWTrack or PETrackI object for Control
+
+    d: cython.int  # extension size for ChIP
+    # extension sizes for Control. Can be multiple values
+    ctrl_d_s: list
+    treat_scaling_factor: cython.float  # scaling factor for ChIP
+    # scaling factor for Control, corresponding to each extension size.
+    ctrl_scaling_factor_s: list
+    # minimum local bias to fill missing values
+    lambda_bg: cython.float
+    # name of common chromosomes in ChIP and Control data
+    chromosomes: list
+    # the pseudocount used to calculate logLR, FE or logFE
+    pseudocount: cython.double
+    # prefix will be added to _pileup.bdg for treatment and
+    # _lambda.bdg for control
+    bedGraph_filename_prefix: bytes
+    # shift of cutting ends before extension
+    end_shift: cython.int
+    # whether trackline should be saved in bedGraph
+    trackline: bool
+    # whether to save pileup and local bias in bedGraph files
+    save_bedGraph: bool
+    # whether to save pileup normalized by sequencing depth in million reads
+    save_SPMR: bool
+    # whether to ignore local bias and use global bias instead
+    no_lambda_flag: bool
+    # whether it's in PE mode; will be detected during initialization
+    PE_mode: bool
+
+    # temporary data buffer
+    # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome
+    chr_pos_treat_ctrl: list
+    bedGraph_treat_filename: bytes
+    bedGraph_control_filename: bytes
+    bedGraph_treat_f: cython.pointer(FILE)
+    bedGraph_ctrl_f: cython.pointer(FILE)
+
+    # data needed to be pre-computed before peak calling
+    # remember the pvalue->qvalue conversion; saved in a cykhash Float32to32Map
+    pqtable: Float32to32Map
+    # whether the pvalue of the whole genome is all calculated. If yes,
+    # it's OK to calculate q-value.
+    pvalue_all_done: bool
+    # record for each pvalue cutoff, how many peaks can be called
+    pvalue_npeaks: dict
+    # record for each pvalue cutoff, the total length of called peaks
+    pvalue_length: dict
+    # automatically decide the p-value cutoff (can be translated into
+    # qvalue cutoff) based on p-value to total peak length analysis.
+    optimal_p_cutoff: cython.float
+    # file to save the pvalue-npeaks-totallength table
+    cutoff_analysis_filename: bytes
+    # Record the names of temporary files for storing pileup values of
+    # each chromosome
+    pileup_data_files: dict
+
+    def __init__(self,
+                 treat,
+                 ctrl,
+                 d: cython.int = 200,
+                 ctrl_d_s: list = [200, 1000, 10000],
+                 treat_scaling_factor: cython.float = 1.0,
+                 ctrl_scaling_factor_s: list = [1.0, 0.2, 0.02],
+                 stderr_on: bool = False,
+                 pseudocount: cython.float = 1,
+                 end_shift: cython.int = 0,
+                 lambda_bg: cython.float = 0,
+                 save_bedGraph: bool = False,
+                 bedGraph_filename_prefix: str = "PREFIX",
+                 bedGraph_treat_filename: str = "TREAT.bdg",
+                 bedGraph_control_filename: str = "CTRL.bdg",
+                 cutoff_analysis_filename: str = "TMP.txt",
+                 save_SPMR: bool = False):
+        """Initialize.
+
+        A calculator is unique to each comparison of treat and
+        control. Treat_depth and ctrl_depth should not be changed
+        during calculation.
+
+        treat and ctrl are either FWTrack or PETrackI objects.
+
+        treat_depth and ctrl_depth are effective depths in million:
+            sequencing depth in million after duplicates are
+            filtered. If treatment is scaled down to the control
+            sample size, then this should be the control sample size
+            in million. And vice versa.
+
+        d, sregion, lregion: d is the fragment size, sregion is the
+            small region size, lregion is the large region size
+
+        pseudocount: a pseudocount used to calculate logLR, FE or logFE.
+            Please note this value will not be changed by the
+            normalization method. So if you really want to set the
+            pseudocount as 1 per million reads, set it after you
+            normalize treat and control by million reads by
+            `change_normalization_method(ord('M'))`.
+
+        """
+        chr1: set
+        chr2: set
+        p: cython.float
+
+        # decide PE mode
+        if isinstance(treat, FWTrack):
+            self.PE_mode = False
+        elif isinstance(treat, PETrackI):
+            self.PE_mode = True
+        else:
+            raise Exception("Should be FWTrack or PETrackI object!")
+        # decide if there is control
+        self.treat = treat
+        if ctrl:
+            self.ctrl = ctrl
+        else:  # while there is no control
+            self.ctrl = treat
+        self.trackline = False
+        self.d = d  # note, self.d doesn't make sense in PE mode
+        self.ctrl_d_s = ctrl_d_s  # note, ctrl_d_s doesn't make sense in PE mode either
+        self.treat_scaling_factor = treat_scaling_factor
+        self.ctrl_scaling_factor_s = ctrl_scaling_factor_s
+        self.end_shift = end_shift
+        self.lambda_bg = lambda_bg
+        self.pqtable = Float32to32Map(for_int=False)  # Float32 -> Float32 map
+        self.save_bedGraph = save_bedGraph
+        self.save_SPMR = save_SPMR
+        self.bedGraph_filename_prefix = bedGraph_filename_prefix.encode()
+        self.bedGraph_treat_filename = bedGraph_treat_filename.encode()
+        self.bedGraph_control_filename = bedGraph_control_filename.encode()
+        if not self.ctrl_d_s or not self.ctrl_scaling_factor_s:
+            self.no_lambda_flag = True
+        else:
+            self.no_lambda_flag = False
+        self.pseudocount = pseudocount
+        # get the common chromosome names from both treatment and control
+        chr1 = set(self.treat.get_chr_names())
+        chr2 = set(self.ctrl.get_chr_names())
+        self.chromosomes = sorted(list(chr1.intersection(chr2)))
+
+        self.pileup_data_files = {}
+        self.pvalue_length = {}
+        self.pvalue_npeaks = {}
+        # step for the optimal cutoff is 0.3 in -log10(pvalue); we try
+        # from pvalue 1E-10 (-log10(p) = 10) down to 0.5 (-log10(p) = 0.3)
+        for p in np.arange(0.3, 10, 0.3):
+            self.pvalue_length[p] = 0
+            self.pvalue_npeaks[p] = 0
+        self.optimal_p_cutoff = 0
+        self.cutoff_analysis_filename = cutoff_analysis_filename.encode()
+
+    @cython.ccall
+    def destroy(self):
+        """Remove temporary files for pileup values of each chromosome.
+
+        Note: This function MUST be called if the object won't be
+        used anymore.
+
+        """
+        f: bytes
+
+        for f in self.pileup_data_files.values():
+            if os.path.isfile(f):
+                os.unlink(f)
+        return
+
+    @cython.ccall
+    def set_pseudocount(self, pseudocount: cython.float):
+        self.pseudocount = pseudocount
+
+    @cython.ccall
+    def enable_trackline(self):
+        """Turn on trackline with bedGraph output
+        """
+        self.trackline = True
+
+    @cython.cfunc
+    def pileup_treat_ctrl_a_chromosome(self, chrom: bytes):
+        """After this function is called, self.chr_pos_treat_ctrl
+        will be reset and assigned to the pileup values of the given
+        chromosome.
+
+        """
+        treat_pv: list
+        ctrl_pv: list
+        f: object
+        temp_filename: str
+
+        assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom
+
+        # check the backup file of pileup values. If it doesn't
+        # exist, create it; otherwise, load it instead of calculating
+        # new pileup values.
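The caching that this comment announces boils down to a compute-once, pickle-per-chromosome pattern. Here it is in isolation, as a sketch with a hypothetical `compute` callback (the class itself uses cPickle and keeps the paths in `self.pileup_data_files`):

    import os
    import pickle
    from tempfile import mkstemp

    _cache = {}  # chrom -> path of the temporary backup file

    def cached(chrom, compute):
        if chrom in _cache:                      # later passes: reload
            with open(_cache[chrom], "rb") as f:
                return pickle.load(f)
        result = compute(chrom)                  # first pass: compute...
        fd, path = mkstemp()
        os.close(fd)
        with open(path, "wb") as f:              # ...then persist to disk
            pickle.dump(result, f, protocol=2)
        _cache[chrom] = path
        return result

This matters because the pileups are walked twice: once to build the p-value to q-value table, and once to call peaks.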
+        if chrom in self.pileup_data_files:
+            try:
+                f = open(self.pileup_data_files[chrom], "rb")
+                self.chr_pos_treat_ctrl = cPickle.load(f)
+                f.close()
+                return
+            except Exception:
+                temp_fd, temp_filename = mkstemp()
+                os.close(temp_fd)
+                # keep bytes, consistent with the else branch below
+                self.pileup_data_files[chrom] = temp_filename.encode()
+        else:
+            temp_fd, temp_filename = mkstemp()
+            os.close(temp_fd)
+            self.pileup_data_files[chrom] = temp_filename.encode()
+
+        # reset or clean the existing self.chr_pos_treat_ctrl
+        if self.chr_pos_treat_ctrl:  # not a beautiful way to clean
+            clean_up_ndarray(self.chr_pos_treat_ctrl[0])
+            clean_up_ndarray(self.chr_pos_treat_ctrl[1])
+            clean_up_ndarray(self.chr_pos_treat_ctrl[2])
+
+        if self.PE_mode:
+            treat_pv = self.treat.pileup_a_chromosome(chrom,
+                                                      [self.treat_scaling_factor,],
+                                                      baseline_value=0.0)
+        else:
+            treat_pv = self.treat.pileup_a_chromosome(chrom,
+                                                      [self.d,],
+                                                      [self.treat_scaling_factor,],
+                                                      baseline_value=0.0,
+                                                      directional=True,
+                                                      end_shift=self.end_shift)
+
+        if not self.no_lambda_flag:
+            if self.PE_mode:
+                # note, we pile up the PE control as SE control
+                # because we assume the bias can only be captured at
+                # the surrounding regions of cutting sites from
+                # control experiments.
+                ctrl_pv = self.ctrl.pileup_a_chromosome_c(chrom,
+                                                          self.ctrl_d_s,
+                                                          self.ctrl_scaling_factor_s,
+                                                          baseline_value=self.lambda_bg)
+            else:
+                ctrl_pv = self.ctrl.pileup_a_chromosome(chrom,
+                                                        self.ctrl_d_s,
+                                                        self.ctrl_scaling_factor_s,
+                                                        baseline_value=self.lambda_bg,
+                                                        directional=False)
+        else:
+            # set a global lambda
+            ctrl_pv = [treat_pv[0][-1:], np.array([self.lambda_bg,],
+                                                  dtype="f4")]
+
+        self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl(treat_pv,
+                                                               ctrl_pv)
+
+        # clean treat_pv and ctrl_pv
+        treat_pv = []
+        ctrl_pv = []
+
+        # save data to a temporary file
+        try:
+            f = open(self.pileup_data_files[chrom], "wb")
+            cPickle.dump(self.chr_pos_treat_ctrl, f, protocol=2)
+            f.close()
+        except Exception:
+            # failed to write; remove the key from pileup_data_files
+            self.pileup_data_files.pop(chrom)
+        return
+
+    @cython.cfunc
+    def __chrom_pair_treat_ctrl(self, treat_pv, ctrl_pv) -> list:
+        """*private* Pair treat and ctrl pileups for each region.
+
+        treat_pv and ctrl_pv are [np.ndarray, np.ndarray].
+
+        return a [p, t, c] list, where each element is a numpy array.
+ """ + index_ret: cython.long + it: cython.long + ic: cython.long + lt: cython.long + lc: cython.long + t_p: cnp.ndarray + c_p: cnp.ndarray + ret_p: cnp.ndarray + t_v: cnp.ndarray + c_v: cnp.ndarray + ret_t: cnp.ndarray + ret_c: cnp.ndarray + t_p_view: cython.pointer(cython.int) + c_p_view: cython.pointer(cython.int) + ret_p_view: cython.pointer(cython.int) + t_v_view: cython.pointer(cython.float) + c_v_view: cython.pointer(cython.float) + ret_t_view: cython.pointer(cython.float) + ret_c_view: cython.pointer(cython.float) + + [t_p, t_v] = treat_pv + [c_p, c_v] = ctrl_pv + + lt = t_p.shape[0] + lc = c_p.shape[0] + + chrom_max_len = lt + lc + + ret_p = np.zeros(chrom_max_len, dtype="i4") # position + ret_t = np.zeros(chrom_max_len, dtype="f4") # value from treatment + ret_c = np.zeros(chrom_max_len, dtype="f4") # value from control + + # t_p_view = t_p #cython.cast(cython.pointer[cython.int], t_p.data) + # t_v_view = t_v #cython.cast(cython.pointer[cython.float], t_v.data) + # c_p_view = c_p #cython.cast(cython.pointer[cython.int], c_p.data) + # c_v_view = c_v #cython.cast(cython.pointer[cython.float], c_v.data) + # ret_p_view = ret_p #cython.cast(cython.pointer[cython.int], ret_p.data) + # ret_t_view = ret_t #cython.cast(cython.pointer[cython.float], ret_t.data) + # ret_c_view = ret_c #cython.cast(cython.pointer[cython.float], ret_c.data) + + t_p_view = cython.cast(cython.pointer(cython.int), t_p.data) + t_v_view = cython.cast(cython.pointer(cython.float), t_v.data) + c_p_view = cython.cast(cython.pointer(cython.int), c_p.data) + c_v_view = cython.cast(cython.pointer(cython.float), c_v.data) + ret_p_view = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_t_view = cython.cast(cython.pointer(cython.float), ret_t.data) + ret_c_view = cython.cast(cython.pointer(cython.float), ret_c.data) + + index_ret = 0 + it = 0 + ic = 0 + + while it < lt and ic < lc: + if t_p_view[0] < c_p_view[0]: + # clip a region from pre_p to p1, then pre_p: set as p1. + ret_p_view[0] = t_p_view[0] + ret_t_view[0] = t_v_view[0] + ret_c_view[0] = c_v_view[0] + ret_p_view += 1 + ret_t_view += 1 + ret_c_view += 1 + index_ret += 1 + # call for the next p1 and v1 + it += 1 + t_p_view += 1 + t_v_view += 1 + elif t_p_view[0] > c_p_view[0]: + # clip a region from pre_p to p2, then pre_p: set as p2. + ret_p_view[0] = c_p_view[0] + ret_t_view[0] = t_v_view[0] + ret_c_view[0] = c_v_view[0] + ret_p_view += 1 + ret_t_view += 1 + ret_c_view += 1 + index_ret += 1 + # call for the next p2 and v2 + ic += 1 + c_p_view += 1 + c_v_view += 1 + else: + # from pre_p to p1 or p2, then pre_p: set as p1 or p2. + ret_p_view[0] = t_p_view[0] + ret_t_view[0] = t_v_view[0] + ret_c_view[0] = c_v_view[0] + ret_p_view += 1 + ret_t_view += 1 + ret_c_view += 1 + index_ret += 1 + # call for the next p1, v1, p2, v2. 
+ it += 1 + ic += 1 + t_p_view += 1 + t_v_view += 1 + c_p_view += 1 + c_v_view += 1 + + ret_p.resize(index_ret, refcheck=False) + ret_t.resize(index_ret, refcheck=False) + ret_c.resize(index_ret, refcheck=False) + return [ret_p, ret_t, ret_c] + + @cython.cfunc + def __cal_score(self, + array1: cnp.ndarray(cython.float, ndim=1), + array2: cnp.ndarray(cython.float, ndim=1), + cal_func) -> cnp.ndarray: + i: cython.long + s: cnp.ndarray(cython.float, ndim=1) + + assert array1.shape[0] == array2.shape[0] + s = np.zeros(array1.shape[0], dtype="f4") + for i in range(array1.shape[0]): + s[i] = cal_func(array1[i], array2[i]) + return s + + @cython.cfunc + def __cal_pvalue_qvalue_table(self): + """After this function is called, self.pqtable is built. All + chromosomes will be iterated. So it will take some time. + + """ + chrom: bytes + pos_array: cnp.ndarray + treat_array: cnp.ndarray + ctrl_array: cnp.ndarray + pscore_stat: dict + pre_p: cython.long + # pre_l: cython.long + l: cython.long + i: cython.long + j: cython.long + this_v: cython.float + # pre_v: cython.float + v: cython.float + q: cython.float + pre_q: cython.float + N: cython.long + k: cython.long + this_l: cython.long + f: cython.float + unique_values: list + pos_view: cython.pointer(cython.int) + treat_value_view: cython.pointer(cython.float) + ctrl_value_view: cython.pointer(cython.float) + + debug("Start to calculate pvalue stat...") + + pscore_stat = {} # dict() + for i in range(len(self.chromosomes)): + chrom = self.chromosomes[i] + pre_p = 0 + + self.pileup_treat_ctrl_a_chromosome(chrom) + [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl + + pos_view = cython.cast(cython.pointer(cython.int), + pos_array.data) + treat_value_view = cython.cast(cython.pointer(cython.float), + treat_array.data) + ctrl_value_view = cython.cast(cython.pointer(cython.float), + ctrl_array.data) + + for j in range(pos_array.shape[0]): + this_v = get_pscore((cython.cast(cython.int, + treat_value_view[0]), + ctrl_value_view[0])) + this_l = pos_view[0] - pre_p + if this_v in pscore_stat: + pscore_stat[this_v] += this_l + else: + pscore_stat[this_v] = this_l + pre_p = pos_view[0] + pos_view += 1 + treat_value_view += 1 + ctrl_value_view += 1 + + N = sum(pscore_stat.values()) # total length + k = 1 # rank + f = -log10(N) + # pre_v = -2147483647 + # pre_l = 0 + pre_q = 2147483647 # save the previous q-value + + self.pqtable = Float32to32Map(for_int=False) + unique_values = sorted(list(pscore_stat.keys()), reverse=True) + for i in range(len(unique_values)): + v = unique_values[i] + l = pscore_stat[v] + q = v + (log10(k) + f) + if q > pre_q: + q = pre_q + if q <= 0: + q = 0 + break + #q = max(0,min(pre_q,q)) # make q-score monotonic + self.pqtable[v] = q + pre_q = q + k += l + # bottom rank pscores all have qscores 0 + for j in range(i, len(unique_values)): + v = unique_values[j] + self.pqtable[v] = 0 + return + + @cython.cfunc + def __pre_computes(self, + max_gap: cython.int = 50, + min_length: cython.int = 200): + """After this function is called, self.pqtable and self.pvalue_length is built. All + chromosomes will be iterated. So it will take some time. 
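The loop just above that fills `self.pqtable` is the Benjamini-Hochberg correction carried out in -log10 space: for a p-score v whose rank (measured in basepairs) is k out of N total, q = v + log10(k) - log10(N), clamped so q-scores stay monotonically non-increasing and non-negative. A worked check with hypothetical numbers:

    from math import log10

    pscore_stat = {8.0: 100, 5.0: 400, 2.0: 9500}  # -log10(p) -> bp covered
    N = sum(pscore_stat.values())                  # 10000
    f = -log10(N)                                  # -4.0

    k, pre_q = 1, float("inf")
    for v in sorted(pscore_stat, reverse=True):
        q = min(max(v + log10(k) + f, 0), pre_q)   # BH in -log10 space
        print(v, round(q, 4))   # 8.0 -> 4.0, 5.0 -> 3.0043, 2.0 -> 0.6998
        pre_q = q
        k += pscore_stat[v]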
+
+        """
+        chrom: bytes
+        pos_array: cnp.ndarray
+        treat_array: cnp.ndarray
+        ctrl_array: cnp.ndarray
+        score_array: cnp.ndarray
+        pscore_stat: dict
+        n: cython.long
+        pre_p: cython.long
+        this_p: cython.long
+        j: cython.long
+        l: cython.long
+        i: cython.long
+        q: cython.float
+        pre_q: cython.float
+        this_v: cython.float
+        v: cython.float
+        cutoff: cython.float
+        N: cython.long
+        k: cython.long
+        this_l: cython.long
+        f: cython.float
+        unique_values: list
+        above_cutoff: cnp.ndarray
+        above_cutoff_endpos: cnp.ndarray
+        above_cutoff_startpos: cnp.ndarray
+        peak_content: list
+        peak_length: cython.long
+        total_l: cython.long
+        total_p: cython.long
+        tmplist: list
+
+        # above cutoff start position pointer
+        acs_ptr: cython.pointer(cython.int)
+        # above cutoff end position pointer
+        ace_ptr: cython.pointer(cython.int)
+        # position array pointer
+        pos_array_ptr: cython.pointer(cython.int)
+        # score array pointer
+        score_array_ptr: cython.pointer(cython.float)
+
+        debug("Start to calculate pvalue stat...")
+
+        # tmplist contains a list of log pvalue cutoffs from 0.3 to 10
+        tmplist = [round(x, 5)
+                   for x in sorted(list(np.arange(0.3, 10.0, 0.3)),
+                                   reverse=True)]
+
+        pscore_stat = {}  # dict()
+        # print (list(pscore_stat.keys()))
+        # print (list(self.pvalue_length.keys()))
+        # print (list(self.pvalue_npeaks.keys()))
+        for i in range(len(self.chromosomes)):
+            chrom = self.chromosomes[i]
+            self.pileup_treat_ctrl_a_chromosome(chrom)
+            [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
+
+            score_array = self.__cal_pscore(treat_array, ctrl_array)
+
+            for n in range(len(tmplist)):
+                cutoff = tmplist[n]
+                total_l = 0  # total length in potential peaks
+                total_p = 0  # total number of potential peaks
+
+                # get the regions with scores above the cutoff. This
+                # is not an optimized method; it would be better to
+                # store the score array in a 2-D ndarray.
+ above_cutoff = np.nonzero(score_array > cutoff)[0] + # end positions of regions where score is above cutoff + above_cutoff_endpos = pos_array[above_cutoff] + # start positions of regions where score is above cutoff + above_cutoff_startpos = pos_array[above_cutoff-1] + + if above_cutoff_endpos.size == 0: + continue + + # first bit of region above cutoff + acs_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_startpos.data) + ace_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_endpos.data) + + peak_content = [(acs_ptr[0], ace_ptr[0]),] + lastp = ace_ptr[0] + acs_ptr += 1 + ace_ptr += 1 + + for i in range(1, above_cutoff_startpos.size): + tl = acs_ptr[0] - lastp + if tl <= max_gap: + peak_content.append((acs_ptr[0], ace_ptr[0])) + else: + peak_length = peak_content[-1][1] - peak_content[0][0] + # if the peak is too small, reject it + if peak_length >= min_length: + total_l += peak_length + total_p += 1 + peak_content = [(acs_ptr[0], ace_ptr[0]),] + lastp = ace_ptr[0] + acs_ptr += 1 + ace_ptr += 1 + + if peak_content: + peak_length = peak_content[-1][1] - peak_content[0][0] + # if the peak is too small, reject it + if peak_length >= min_length: + total_l += peak_length + total_p += 1 + self.pvalue_length[cutoff] = self.pvalue_length.get(cutoff, 0) + total_l + self.pvalue_npeaks[cutoff] = self.pvalue_npeaks.get(cutoff, 0) + total_p + + pos_array_ptr = cython.cast(cython.pointer(cython.int), + pos_array.data) + score_array_ptr = cython.cast(cython.pointer(cython.float), + score_array.data) + + pre_p = 0 + for i in range(pos_array.shape[0]): + this_p = pos_array_ptr[0] + this_l = this_p - pre_p + this_v = score_array_ptr[0] + if this_v in pscore_stat: + pscore_stat[this_v] += this_l + else: + pscore_stat[this_v] = this_l + pre_p = this_p # pos_array[i] + pos_array_ptr += 1 + score_array_ptr += 1 + + # debug ("make pscore_stat cost %.5f seconds" % t) + + # add all pvalue cutoffs from cutoff-analysis part. So that we + # can get the corresponding qvalues for them. 
+        for cutoff in tmplist:
+            if cutoff not in pscore_stat:
+                pscore_stat[cutoff] = 0
+
+        N = sum(pscore_stat.values())  # total length
+        k = 1                          # rank
+        f = -log10(N)
+        pre_q = 2147483647             # save the previous q-value
+
+        self.pqtable = Float32to32Map(for_int=False)  # {}
+        # sorted(unique_values,reverse=True)
+        unique_values = sorted(list(pscore_stat.keys()), reverse=True)
+        for i in range(len(unique_values)):
+            v = unique_values[i]
+            l = pscore_stat[v]
+            q = v + (log10(k) + f)
+            if q > pre_q:
+                q = pre_q
+            if q <= 0:
+                q = 0
+                break
+            # q = max(0,min(pre_q,q)) # make q-score monotonic
+            self.pqtable[v] = q
+            pre_q = q
+            k += l
+        # bottom-ranked pscores all get a qscore of 0
+        for j in range(i, len(unique_values)):
+            v = unique_values[j]
+            self.pqtable[v] = 0
+
+        # write pvalue and total length of predicted peaks
+        # this is the output from cutoff-analysis
+        fhd = open(self.cutoff_analysis_filename, "w")
+        fhd.write("pscore\tqscore\tnpeaks\tlpeaks\tavelpeak\n")
+        x = []
+        y = []
+        for cutoff in tmplist:
+            if self.pvalue_npeaks[cutoff] > 0:
+                fhd.write("%.2f\t%.2f\t%d\t%d\t%.2f\n" %
+                          (cutoff, self.pqtable[cutoff],
+                           self.pvalue_npeaks[cutoff],
+                           self.pvalue_length[cutoff],
+                           self.pvalue_length[cutoff]/self.pvalue_npeaks[cutoff]))
+                x.append(cutoff)
+                y.append(self.pvalue_length[cutoff])
+        fhd.close()
+        info("#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename)
+        # info("#3 Suggest a cutoff...")
+        # optimal_cutoff, optimal_length = find_optimal_cutoff(x, y)
+        # info("#3 -log10(pvalue) cutoff %.2f will call approximately %.0f bps regions as significant regions" % (optimal_cutoff, optimal_length))
+        # print (list(pqtable.keys()))
+        # print (list(self.pvalue_length.keys()))
+        # print (list(self.pvalue_npeaks.keys()))
+        return
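With the pre-computation in place, the public entry point `call_peaks` (defined next) drives everything. A hypothetical driver, where `treat_track` and `ctrl_track` stand in for FWTrack objects produced by the parsers, and the numeric settings are illustrative only:

    caller = CallerFromAlignments(treat_track, ctrl_track,
                                  d=200,
                                  ctrl_d_s=[200, 1000, 10000],
                                  ctrl_scaling_factor_s=[1.0, 0.2, 0.02],
                                  lambda_bg=0.5)
    # keep regions whose q-score (-log10 qvalue) exceeds 1.301, i.e. q < 0.05
    peaks = caller.call_peaks(['q'], [1.301],
                              min_length=200, max_gap=50,
                              call_summits=True)
    caller.destroy()   # mandatory: removes the per-chromosome temp files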
+    @cython.ccall
+    def call_peaks(self,
+                   scoring_function_symbols: list,
+                   score_cutoff_s: list,
+                   min_length: cython.int = 200,
+                   max_gap: cython.int = 50,
+                   call_summits: bool = False,
+                   cutoff_analysis: bool = False):
+        """Call peaks for all chromosomes. Return a PeakIO object.
+
+        scoring_function_symbols: symbols of functions to calculate
+            score. 'p' for pscore, 'q' for qscore, 'f' for fold
+            change, 's' for subtraction. e.g. ['p', 'q']
+        score_cutoff_s: cutoff values corresponding to scoring functions
+        min_length: minimum length of a peak
+        max_gap: maximum gap of 'insignificant' regions within a
+            peak. Note, for PE_mode, max_gap and max_length are both
+            set as the fragment length.
+        call_summits: boolean. Whether or not to call sub-peaks.
+
+        Pileup and control will also be saved into bedGraph files if
+        self.save_bedGraph is true.
+        """
+        chrom: bytes
+        tmp_bytes: bytes
+
+        peaks = PeakIO()
+
+        # prepare p-q table
+        if len(self.pqtable) == 0:
+            info("#3 Pre-compute pvalue-qvalue table...")
+            if cutoff_analysis:
+                info("#3 Cutoff vs peaks called will be analyzed!")
+                self.__pre_computes(max_gap=max_gap, min_length=min_length)
+            else:
+                self.__cal_pvalue_qvalue_table()
+
+        # prepare bedGraph file
+        if self.save_bedGraph:
+            self.bedGraph_treat_f = fopen(self.bedGraph_treat_filename, "w")
+            self.bedGraph_ctrl_f = fopen(self.bedGraph_control_filename, "w")
+
+            info("#3 In the peak calling step, the following will be performed simultaneously:")
+            info("#3   Write bedGraph files for treatment pileup (after scaling if necessary)... %s" %
+                 self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg")
+            info("#3   Write bedGraph files for control lambda (after scaling if necessary)... %s" %
+                 self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg")
+
+            if self.save_SPMR:
+                info("#3   --SPMR is requested, so pileup will be normalized by sequencing depth in million reads.")
+            elif self.treat_scaling_factor == 1:
+                info("#3   Pileup will be based on sequencing depth in treatment.")
+            else:
+                info("#3   Pileup will be based on sequencing depth in control.")
+
+            if self.trackline:
+                # this line is REQUIRED by the wiggle format for UCSC browser
+                tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
+                fprintf(self.bedGraph_treat_f, tmp_bytes)
+                tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode()
+                fprintf(self.bedGraph_ctrl_f, tmp_bytes)
+
+        info("#3 Call peaks for each chromosome...")
+        for chrom in self.chromosomes:
+            # treat/control bedGraph will be saved if requested by user.
+            self.__chrom_call_peak_using_certain_criteria(peaks,
+                                                          chrom,
+                                                          scoring_function_symbols,
+                                                          score_cutoff_s,
+                                                          min_length,
+                                                          max_gap,
+                                                          call_summits,
+                                                          self.save_bedGraph)
+
+        # close bedGraph file
+        if self.save_bedGraph:
+            fclose(self.bedGraph_treat_f)
+            fclose(self.bedGraph_ctrl_f)
+            self.save_bedGraph = False
+
+        return peaks
+
+    @cython.cfunc
+    def __chrom_call_peak_using_certain_criteria(self,
+                                                 peaks,
+                                                 chrom: bytes,
+                                                 scoring_function_s: list,
+                                                 score_cutoff_s: list,
+                                                 min_length: cython.int,
+                                                 max_gap: cython.int,
+                                                 call_summits: bool,
+                                                 save_bedGraph: bool):
+        """Call peaks for a chromosome.
+
+        Combination of criteria is allowed here.
+
+        peaks: a PeakIO object, the return value of this function
+        scoring_function_s: symbols of functions to calculate score
+            as score=f(x, y) where x is treatment pileup, and y is
+            control pileup
+        save_bedGraph: whether or not to save pileup and control into
+            a bedGraph file
+        """
+        i: cython.int
+        s: str
+        above_cutoff: cnp.ndarray
+        above_cutoff_endpos: cnp.ndarray
+        above_cutoff_startpos: cnp.ndarray
+        pos_array: cnp.ndarray
+        above_cutoff_index_array: cnp.ndarray
+        treat_array: cnp.ndarray
+        ctrl_array: cnp.ndarray
+        score_array_s: list  # a list to keep different types of scores
+        peak_content: list   # to store information for a chunk in a
+                             # peak region; it contains lists of:
+                             # 1. left position; 2. right position;
+                             # 3. treatment value; 4. control value;
+                             # 5. a list of scores at this chunk
+        tl: cython.long
+        lastp: cython.long
+        ts: cython.long
+        te: cython.long
+        ti: cython.long
+        tp: cython.float
+        cp: cython.float
+        acs_ptr: cython.pointer(cython.int)
+        ace_ptr: cython.pointer(cython.int)
+        acia_ptr: cython.pointer(cython.int)
+        treat_array_ptr: cython.pointer(cython.float)
+        ctrl_array_ptr: cython.pointer(cython.float)
+
+        assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!"
+
+        peak_content = []  # to store points above cutoff
+
+        # first, build pileup, self.chr_pos_treat_ctrl
+        # this step will be sped up if pqtable is pre-computed.
+ self.pileup_treat_ctrl_a_chromosome(chrom) + [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl + + # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome + if save_bedGraph: + self.__write_bedGraph_for_a_chromosome(chrom) + + # keep all types of scores needed + # t0 = ttime() + score_array_s = [] + for i in range(len(scoring_function_s)): + s = scoring_function_s[i] + if s == 'p': + score_array_s.append(self.__cal_pscore(treat_array, + ctrl_array)) + elif s == 'q': + score_array_s.append(self.__cal_qscore(treat_array, + ctrl_array)) + elif s == 'f': + score_array_s.append(self.__cal_FE(treat_array, + ctrl_array)) + elif s == 's': + score_array_s.append(self.__cal_subtraction(treat_array, + ctrl_array)) + + # get the regions with scores above cutoffs. this is not an + # optimized method. It would be better to store score array in + # a 2-D ndarray? + above_cutoff = np.nonzero(apply_multiple_cutoffs(score_array_s, + score_cutoff_s))[0] + # indices + above_cutoff_index_array = np.arange(pos_array.shape[0], + dtype="i4")[above_cutoff] + # end positions of regions where score is above cutoff + above_cutoff_endpos = pos_array[above_cutoff] + # start positions of regions where score is above cutoff + above_cutoff_startpos = pos_array[above_cutoff-1] + + if above_cutoff.size == 0: + # nothing above cutoff + return + + if above_cutoff[0] == 0: + # first element > cutoff, fix the first point as + # 0. otherwise it would be the last item in + # data[chrom]['pos'] + above_cutoff_startpos[0] = 0 + + # print "apply cutoff -- chrom:",chrom," time:", ttime() - t0 + # start to build peak regions + # t0 = ttime() + + # first bit of region above cutoff + acs_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_startpos.data) + ace_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_endpos.data) + acia_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_index_array.data) + treat_array_ptr = cython.cast(cython.pointer(cython.float), + treat_array.data) + ctrl_array_ptr = cython.cast(cython.pointer(cython.float), + ctrl_array.data) + + ts = acs_ptr[0] + te = ace_ptr[0] + ti = acia_ptr[0] + tp = treat_array_ptr[ti] + cp = ctrl_array_ptr[ti] + + peak_content.append((ts, te, tp, cp, ti)) + lastp = te + acs_ptr += 1 + ace_ptr += 1 + acia_ptr += 1 + + for i in range(1, above_cutoff_startpos.shape[0]): + ts = acs_ptr[0] + te = ace_ptr[0] + ti = acia_ptr[0] + acs_ptr += 1 + ace_ptr += 1 + acia_ptr += 1 + tp = treat_array_ptr[ti] + cp = ctrl_array_ptr[ti] + tl = ts - lastp + if tl <= max_gap: + # append. + peak_content.append((ts, te, tp, cp, ti)) + lastp = te # above_cutoff_endpos[i] + else: + # close + if call_summits: + # smooth length is min_length, i.e. fragment size 'd' + self.__close_peak_with_subpeaks(peak_content, + peaks, + min_length, + chrom, + min_length, + score_array_s, + score_cutoff_s=score_cutoff_s) + else: + # smooth length is min_length, i.e. fragment size 'd' + self.__close_peak_wo_subpeaks(peak_content, + peaks, + min_length, + chrom, + min_length, + score_array_s, + score_cutoff_s=score_cutoff_s) + peak_content = [(ts, te, tp, cp, ti),] + lastp = te # above_cutoff_endpos[i] + # save the last peak + if not peak_content: + return + else: + if call_summits: + # smooth length is min_length, i.e. fragment size 'd' + self.__close_peak_with_subpeaks(peak_content, + peaks, + min_length, + chrom, + min_length, + score_array_s, + score_cutoff_s=score_cutoff_s) + else: + # smooth length is min_length, i.e. 
fragment size 'd'
+                self.__close_peak_wo_subpeaks(peak_content,
+                                              peaks,
+                                              min_length,
+                                              chrom,
+                                              min_length,
+                                              score_array_s,
+                                              score_cutoff_s=score_cutoff_s)
+
+        # print "close peaks -- chrom:",chrom," time:", ttime() - t0
+        return
+
+    @cython.cfunc
+    def __close_peak_wo_subpeaks(self,
+                                 peak_content: list,
+                                 peaks,
+                                 min_length: cython.int,
+                                 chrom: bytes,
+                                 smoothlen: cython.int,
+                                 score_array_s: list,
+                                 score_cutoff_s: list = []) -> bool:
+        """Close the peak region, output peak boundaries, peak summit
+        and scores, then add the peak to the peakIO object.
+
+        peak_content contains [start, end, treat_p, ctrl_p, index_in_score_array]
+
+        peaks: a PeakIO object
+
+        """
+        summit_pos: cython.int
+        tstart: cython.int
+        tend: cython.int
+        summit_index: cython.int
+        i: cython.int
+        midindex: cython.int
+        ttreat_p: cython.double
+        tctrl_p: cython.double
+        tscore: cython.double
+        summit_treat: cython.double
+        summit_ctrl: cython.double
+        summit_p_score: cython.double
+        summit_q_score: cython.double
+        tlist_scores_p: cython.int
+
+        peak_length = peak_content[-1][1] - peak_content[0][0]
+        if peak_length >= min_length:  # if the peak is too small, reject it
+            tsummit = []
+            summit_pos = 0
+            summit_value = 0
+            for i in range(len(peak_content)):
+                (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
+                tscore = ttreat_p  # use the treatment pileup as the general score to find the summit
+                if not summit_value or summit_value < tscore:
+                    tsummit = [(tend + tstart) // 2,]
+                    tsummit_index = [i,]
+                    summit_value = tscore
+                elif summit_value == tscore:
+                    # remember continuous summit values
+                    tsummit.append((tend + tstart) // 2)
+                    tsummit_index.append(i)
+            # the middle of all highest points in the peak region is
+            # defined as the summit
+            midindex = (len(tsummit) + 1) // 2 - 1
+            summit_pos = tsummit[midindex]
+            summit_index = tsummit_index[midindex]
+
+            summit_treat = peak_content[summit_index][2]
+            summit_ctrl = peak_content[summit_index][3]
+
+            # this is a double-check to see if the summit can pass cutoff values.
+            for i in range(len(score_cutoff_s)):
+                if score_cutoff_s[i] > score_array_s[i][peak_content[summit_index][4]]:
+                    return False  # not passed, then discard this peak.
+
+            summit_p_score = pscore_dict[(cython.cast(cython.int,
+                                                      summit_treat),
+                                          summit_ctrl)]
+            summit_q_score = self.pqtable[summit_p_score]
+
+            peaks.add(chrom,                    # chromosome
+                      peak_content[0][0],       # start
+                      peak_content[-1][1],      # end
+                      summit=summit_pos,        # summit position
+                      peak_score=summit_q_score,  # score at summit
+                      pileup=summit_treat,      # pileup
+                      pscore=summit_p_score,    # pvalue
+                      fold_change=(summit_treat + self.pseudocount) / (summit_ctrl + self.pseudocount),  # fold change
+                      qscore=summit_q_score     # qvalue
+                      )
+            # start a new peak
+            return True
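As `__close_peak_wo_subpeaks` above shows, the reported summit is the midpoint of the highest-pileup chunk (the middle one when several tie), and fold enrichment applies the pseudocount to both samples. The two computations in isolation, ignoring the tie handling (the chunk tuples are hypothetical):

    # (start, end, treat_pileup, ctrl_pileup, score_index) chunks of one peak
    peak_content = [(100, 150, 8.0, 2.0, 0),
                    (150, 210, 15.0, 2.5, 1),   # highest pileup
                    (210, 260, 9.0, 2.0, 2)]
    pseudocount = 1.0

    best = max(peak_content, key=lambda c: c[2])
    summit = (best[0] + best[1]) // 2
    fold_change = (best[2] + pseudocount) / (best[3] + pseudocount)
    print(summit, round(fold_change, 3))   # 180 4.571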
+    @cython.cfunc
+    def __close_peak_with_subpeaks(self,
+                                   peak_content: list,
+                                   peaks,
+                                   min_length: cython.int,
+                                   chrom: bytes,
+                                   smoothlen: cython.int,
+                                   score_array_s: list,
+                                   score_cutoff_s: list = [],
+                                   min_valley: cython.float = 0.9) -> bool:
+        """Algorithm implemented by Ben, to profile the pileup
+        signals within a peak region then find subpeak summits. This
+        method is highly recommended for TFBS or DNAase I sites.
+
+        """
+        tstart: cython.int
+        tend: cython.int
+        summit_index: cython.int
+        summit_offset: cython.int
+        start: cython.int
+        end: cython.int
+        i: cython.int
+        start_boundary: cython.int
+        m: cython.int
+        n: cython.int
+        ttreat_p: cython.double
+        tctrl_p: cython.double
+        tscore: cython.double
+        summit_treat: cython.double
+        summit_ctrl: cython.double
+        summit_p_score: cython.double
+        summit_q_score: cython.double
+        peakdata: cnp.ndarray(cython.float, ndim=1)
+        peakindices: cnp.ndarray(cython.int, ndim=1)
+        summit_offsets: cnp.ndarray(cython.int, ndim=1)
+        tlist_scores_p: cython.int
+
+        peak_length = peak_content[-1][1] - peak_content[0][0]
+
+        if peak_length < min_length:
+            return  # if the region is too small, reject it
+
+        # Add 10 bp padding to the peak region so that we can get true minima
+        end = peak_content[-1][1] + 10
+        start = peak_content[0][0] - 10
+        if start < 0:
+            # this is the offset of the original peak boundary in the
+            # peakdata list.
+            start_boundary = 10 + start
+            start = 0
+        else:
+            # this is the offset of the original peak boundary in the
+            # peakdata list.
+            start_boundary = 10
+
+        # save the scores (qscore) for each position in this region
+        peakdata = np.zeros(end - start, dtype='f4')
+        # save the indices for each position in this region
+        peakindices = np.zeros(end - start, dtype='i4')
+        for i in range(len(peak_content)):
+            (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
+            tscore = ttreat_p  # use pileup as general score to find summit
+            m = tstart - start + start_boundary
+            n = tend - start + start_boundary
+            peakdata[m:n] = tscore
+            peakindices[m:n] = i
+
+        # offsets are the indices for summits in the peakdata/peakindices array.
+        summit_offsets = maxima(peakdata, smoothlen)
+
+        if summit_offsets.shape[0] == 0:
+            # **failsafe** if no summits, fall back on old approach #
+            return self.__close_peak_wo_subpeaks(peak_content,
+                                                 peaks,
+                                                 min_length,
+                                                 chrom,
+                                                 smoothlen,
+                                                 score_array_s,
+                                                 score_cutoff_s)
+        else:
+            # remove maxima that occurred in the padding
+            m = np.searchsorted(summit_offsets,
+                                start_boundary)
+            n = np.searchsorted(summit_offsets,
+                                peak_length + start_boundary,
+                                'right')
+            summit_offsets = summit_offsets[m:n]
+
+        summit_offsets = enforce_peakyness(peakdata, summit_offsets)
+
+        # print "enforced:",summit_offsets
+        if summit_offsets.shape[0] == 0:
+            # **failsafe** if no summits, fall back on old approach #
+            return self.__close_peak_wo_subpeaks(peak_content,
+                                                 peaks,
+                                                 min_length,
+                                                 chrom,
+                                                 smoothlen,
+                                                 score_array_s,
+                                                 score_cutoff_s)
+
+        # these indices point into peak_content
+        summit_indices = peakindices[summit_offsets]
+
+        summit_offsets -= start_boundary
+
+        for summit_offset, summit_index in list(zip(summit_offsets,
+                                                    summit_indices)):
+
+            summit_treat = peak_content[summit_index][2]
+            summit_ctrl = peak_content[summit_index][3]
+
+            summit_p_score = pscore_dict[(cython.cast(cython.int,
+                                                      summit_treat),
+                                          summit_ctrl)]
+            summit_q_score = self.pqtable[summit_p_score]
+
+            for i in range(len(score_cutoff_s)):
+                if score_cutoff_s[i] > score_array_s[i][peak_content[summit_index][4]]:
+                    return False  # not passed, then discard this summit.
+ + peaks.add(chrom, + peak_content[0][0], + peak_content[-1][1], + summit=start + summit_offset, + peak_score=summit_q_score, + pileup=summit_treat, + pscore=summit_p_score, + fold_change=(summit_treat + self.pseudocount) / (summit_ctrl + self.pseudocount), # fold change + qscore=summit_q_score + ) + # start a new peak + return True + + @cython.cfunc + def __cal_pscore(self, + array1: cnp.ndarray, + array2: cnp.ndarray) -> cnp.ndarray: + + i: cython.long + array1_size: cython.long + s: cnp.ndarray + a1_ptr: cython.pointer(cython.float) + a2_ptr: cython.pointer(cython.float) + s_ptr: cython.pointer(cython.float) + + assert array1.shape[0] == array2.shape[0] + s = np.zeros(array1.shape[0], dtype="f4") + + a1_ptr = cython.cast(cython.pointer(cython.float), array1.data) + a2_ptr = cython.cast(cython.pointer(cython.float), array2.data) + s_ptr = cython.cast(cython.pointer(cython.float), s.data) + + array1_size = array1.shape[0] + + for i in range(array1_size): + s_ptr[0] = get_pscore((cython.cast(cython.int, + a1_ptr[0]), + a2_ptr[0])) + s_ptr += 1 + a1_ptr += 1 + a2_ptr += 1 + return s + + @cython.cfunc + def __cal_qscore(self, + array1: cnp.ndarray, + array2: cnp.ndarray) -> cnp.ndarray: + i: cython.long + s: cnp.ndarray + a1_ptr: cython.pointer(cython.float) + a2_ptr: cython.pointer(cython.float) + s_ptr: cython.pointer(cython.float) + + assert array1.shape[0] == array2.shape[0] + s = np.zeros(array1.shape[0], dtype="f4") + + a1_ptr = cython.cast(cython.pointer(cython.float), array1.data) + a2_ptr = cython.cast(cython.pointer(cython.float), array2.data) + s_ptr = cython.cast(cython.pointer(cython.float), s.data) + + for i in range(array1.shape[0]): + s_ptr[0] = self.pqtable[get_pscore((cython.cast(cython.int, + a1_ptr[0]), + a2_ptr[0]))] + s_ptr += 1 + a1_ptr += 1 + a2_ptr += 1 + return s + + @cython.cfunc + def __cal_logLR(self, + array1: cnp.ndarray, + array2: cnp.ndarray) -> cnp.ndarray: + i: cython.long + s: cnp.ndarray + a1_ptr: cython.pointer(cython.float) + a2_ptr: cython.pointer(cython.float) + s_ptr: cython.pointer(cython.float) + + assert array1.shape[0] == array2.shape[0] + s = np.zeros(array1.shape[0], dtype="f4") + + a1_ptr = cython.cast(cython.pointer(cython.float), array1.data) + a2_ptr = cython.cast(cython.pointer(cython.float), array2.data) + s_ptr = cython.cast(cython.pointer(cython.float), s.data) + + for i in range(array1.shape[0]): + s_ptr[0] = get_logLR_asym((a1_ptr[0] + self.pseudocount, + a2_ptr[0] + self.pseudocount)) + s_ptr += 1 + a1_ptr += 1 + a2_ptr += 1 + return s + + @cython.cfunc + def __cal_logFE(self, + array1: cnp.ndarray, + array2: cnp.ndarray) -> cnp.ndarray: + i: cython.long + s: cnp.ndarray + a1_ptr: cython.pointer(cython.float) + a2_ptr: cython.pointer(cython.float) + s_ptr: cython.pointer(cython.float) + + assert array1.shape[0] == array2.shape[0] + s = np.zeros(array1.shape[0], dtype="f4") + + a1_ptr = cython.cast(cython.pointer(cython.float), array1.data) + a2_ptr = cython.cast(cython.pointer(cython.float), array2.data) + s_ptr = cython.cast(cython.pointer(cython.float), s.data) + + for i in range(array1.shape[0]): + s_ptr[0] = get_logFE(a1_ptr[0] + self.pseudocount, + a2_ptr[0] + self.pseudocount) + s_ptr += 1 + a1_ptr += 1 + a2_ptr += 1 + return s + + @cython.cfunc + def __cal_FE(self, + array1: cnp.ndarray, + array2: cnp.ndarray) -> cnp.ndarray: + i: cython.long + s: cnp.ndarray + a1_ptr: cython.pointer(cython.float) + a2_ptr: cython.pointer(cython.float) + s_ptr: cython.pointer(cython.float) + + assert array1.shape[0] == array2.shape[0] + s = 
np.zeros(array1.shape[0], dtype="f4") + + a1_ptr = cython.cast(cython.pointer(cython.float), array1.data) + a2_ptr = cython.cast(cython.pointer(cython.float), array2.data) + s_ptr = cython.cast(cython.pointer(cython.float), s.data) + + for i in range(array1.shape[0]): + s_ptr[0] = (a1_ptr[0] + self.pseudocount) / (a2_ptr[0] + self.pseudocount) + s_ptr += 1 + a1_ptr += 1 + a2_ptr += 1 + return s + + @cython.cfunc + def __cal_subtraction(self, + array1: cnp.ndarray, + array2: cnp.ndarray) -> cnp.ndarray: + i: cython.long + s: cnp.ndarray + a1_ptr: cython.pointer(cython.float) + a2_ptr: cython.pointer(cython.float) + s_ptr: cython.pointer(cython.float) + + assert array1.shape[0] == array2.shape[0] + s = np.zeros(array1.shape[0], dtype="f4") + + a1_ptr = cython.cast(cython.pointer(cython.float), array1.data) + a2_ptr = cython.cast(cython.pointer(cython.float), array2.data) + s_ptr = cython.cast(cython.pointer(cython.float), s.data) + + for i in range(array1.shape[0]): + s_ptr[0] = a1_ptr[0] - a2_ptr[0] + s_ptr += 1 + a1_ptr += 1 + a2_ptr += 1 + return s + + @cython.cfunc + def __write_bedGraph_for_a_chromosome(self, chrom: bytes) -> bool: + """Write treat/control values for a certain chromosome into a + specified file handler. + + """ + pos_array: cnp.ndarray + treat_array: cnp.ndarray + ctrl_array: cnp.ndarray + pos_array_ptr: cython.pointer(cython.int) + treat_array_ptr: cython.pointer(cython.float) + ctrl_array_ptr: cython.pointer(cython.float) + l: cython.int + i: cython.int + p: cython.int + pre_p_t: cython.int + # current position, previous position for treat, previous position for control + pre_p_c: cython.int + pre_v_t: cython.float + pre_v_c: cython.float + v_t: cython.float + # previous value for treat, for control, current value for treat, for control + v_c: cython.float + # 1 if save_SPMR is false, or depth in million if save_SPMR is + # true. Note, while piling up and calling peaks, treatment and + # control have been scaled to the same depth, so we need to + # find what this 'depth' is. 
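To answer that question concretely: with --SPMR the emitted values become signal per million reads, and since treatment and control were already scaled to a common depth, the divisor is whichever sample's total (in millions) that common depth corresponds to. A sketch of the decision with hypothetical totals:

    treat_total, ctrl_total = 18_000_000, 25_000_000  # filtered read counts
    treat_scaling_factor = 1.0    # control was scaled toward treatment

    if treat_scaling_factor == 1:
        denominator = treat_total / 1e6   # pileups are at treatment depth
    else:
        denominator = ctrl_total / 1e6    # pileups are at control depth
    # every value written to the bedGraph is then value / denominator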
+        denominator: cython.float
+        ft: cython.pointer(FILE)
+        fc: cython.pointer(FILE)
+
+        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
+        pos_array_ptr = cython.cast(cython.pointer(cython.int),
+                                    pos_array.data)
+        treat_array_ptr = cython.cast(cython.pointer(cython.float),
+                                      treat_array.data)
+        ctrl_array_ptr = cython.cast(cython.pointer(cython.float),
+                                     ctrl_array.data)
+
+        if self.save_SPMR:
+            if self.treat_scaling_factor == 1:
+                # in this case, control has been asked to be scaled
+                # to the depth of treatment
+                denominator = self.treat.total/1e6
+            else:
+                # in this case, treatment has been asked to be scaled
+                # to the depth of control
+                denominator = self.ctrl.total/1e6
+        else:
+            denominator = 1.0
+
+        l = pos_array.shape[0]
+
+        if l == 0:  # if there is no data, return
+            return False
+
+        ft = self.bedGraph_treat_f
+        fc = self.bedGraph_ctrl_f
+        # t_write_func = self.bedGraph_treat.write
+        # c_write_func = self.bedGraph_ctrl.write
+
+        pre_p_t = 0
+        pre_p_c = 0
+        pre_v_t = treat_array_ptr[0]/denominator
+        pre_v_c = ctrl_array_ptr[0]/denominator
+        treat_array_ptr += 1
+        ctrl_array_ptr += 1
+
+        for i in range(1, l):
+            v_t = treat_array_ptr[0]/denominator
+            v_c = ctrl_array_ptr[0]/denominator
+            p = pos_array_ptr[0]
+            pos_array_ptr += 1
+            treat_array_ptr += 1
+            ctrl_array_ptr += 1
+
+            if abs(pre_v_t - v_t) > 1e-5:  # precision is 5 digits
+                fprintf(ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t)
+                pre_v_t = v_t
+                pre_p_t = p
+
+            if abs(pre_v_c - v_c) > 1e-5:  # precision is 5 digits
+                fprintf(fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c)
+                pre_v_c = v_c
+                pre_p_c = p
+
+        p = pos_array_ptr[0]
+        # the last one
+        fprintf(ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t)
+        fprintf(fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c)
+
+        return True
+
+    @cython.ccall
+    def call_broadpeaks(self,
+                        scoring_function_symbols: list,
+                        lvl1_cutoff_s: list,
+                        lvl2_cutoff_s: list,
+                        min_length: cython.int = 200,
+                        lvl1_max_gap: cython.int = 50,
+                        lvl2_max_gap: cython.int = 400,
+                        cutoff_analysis: bool = False):
+        """This function tries to find enriched regions within which
+        scores are continuously higher than a given cutoff for level
+        1, and links them using gaps above the level 2 cutoff, with a
+        maximum length of lvl2_max_gap.
+
+        scoring_function_symbols: symbols of functions to calculate
+            score. 'p' for pscore, 'q' for qscore, 'f' for fold
+            change, 's' for subtraction. e.g. ['p', 'q']
+
+        lvl1_cutoff_s: a list of cutoffs at highly enriched regions,
+            corresponding to scoring functions.
+
+        lvl2_cutoff_s: a list of cutoffs at less enriched regions,
+            corresponding to scoring functions.
+
+        min_length: minimum peak length, default 200.
+
+        lvl1_max_gap: maximum gap to merge nearby enriched peaks,
+            default 50.
+
+        lvl2_max_gap: maximum length of linkage regions, default 400.
+
+        Return both a general PeakIO object for highly enriched
+        regions and gapped broad regions in a BroadPeakIO object.
+ + """ + i: cython.int + j: cython.int + chrom: bytes + lvl1peaks: object + lvl1peakschrom: object + lvl1: object + lvl2peaks: object + lvl2peakschrom: object + lvl2: object + broadpeaks: object + chrs: set + tmppeakset: list + + lvl1peaks = PeakIO() + lvl2peaks = PeakIO() + + # prepare p-q table + if len(self.pqtable) == 0: + info("#3 Pre-compute pvalue-qvalue table...") + if cutoff_analysis: + info("#3 Cutoff value vs broad region calls will be analyzed!") + self.__pre_computes(max_gap=lvl2_max_gap, min_length=min_length) + else: + self.__cal_pvalue_qvalue_table() + + # prepare bedGraph file + if self.save_bedGraph: + + self.bedGraph_treat_f = fopen(self.bedGraph_treat_filename, "w") + self.bedGraph_ctrl_f = fopen(self.bedGraph_control_filename, "w") + info("#3 In the peak calling step, the following will be performed simultaneously:") + info("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg") + info("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg") + + if self.trackline: + # this line is REQUIRED by the wiggle format for UCSC browser + tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode() + fprintf(self.bedGraph_treat_f, tmp_bytes) + tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode() + fprintf(self.bedGraph_ctrl_f, tmp_bytes) + + info("#3 Call peaks for each chromosome...") + for chrom in self.chromosomes: + self.__chrom_call_broadpeak_using_certain_criteria(lvl1peaks, + lvl2peaks, + chrom, + scoring_function_symbols, + lvl1_cutoff_s, + lvl2_cutoff_s, + min_length, + lvl1_max_gap, + lvl2_max_gap, + self.save_bedGraph) + + # close bedGraph file + if self.save_bedGraph: + fclose(self.bedGraph_treat_f) + fclose(self.bedGraph_ctrl_f) + # self.bedGraph_ctrl.close() + self.save_bedGraph = False + + # now combine lvl1 and lvl2 peaks + chrs = lvl1peaks.get_chr_names() + broadpeaks = BroadPeakIO() + # use lvl2_peaks as linking regions between lvl1_peaks + for chrom in sorted(chrs): + lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom) + lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom) + lvl1peakschrom_next = iter(lvl1peakschrom).__next__ + tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region + # our assumption is lvl1 regions should be included in lvl2 regions + try: + lvl1 = lvl1peakschrom_next() + for i in range(len(lvl2peakschrom)): + # for each lvl2 peak, find all lvl1 peaks inside + # I assume lvl1 peaks can be ALL covered by lvl2 peaks. 
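That nesting assumption holds because the lvl2 cutoff is looser and its maximum gap larger, so every strong region sits inside some weak region. The containment test applied below, in isolation (coordinates hypothetical):

    lvl2 = {"start": 1000, "end": 5000}           # weak, broad region
    lvl1s = [{"start": 1200, "end": 1800},        # strong peaks
             {"start": 3000, "end": 3400},
             {"start": 7000, "end": 7300}]        # belongs to a later lvl2

    inside = [p for p in lvl1s
              if lvl2["start"] <= p["start"] and p["end"] <= lvl2["end"]]
    print(len(inside))   # 2 -> these become blocks of one broad peak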
+                lvl2 = lvl2peakschrom[i]
+
+                while True:
+                    if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]:
+                        tmppeakset.append(lvl1)
+                        lvl1 = lvl1peakschrom_next()
+                    else:
+                        # make a hierarchical broad peak
+                        # print lvl2["start"], lvl2["end"], lvl2["score"]
+                        self.__add_broadpeak(broadpeaks,
+                                             chrom,
+                                             lvl2,
+                                             tmppeakset)
+                        tmppeakset = []
+                        break
+        except StopIteration:
+            # no more strong (aka lvl1) peaks left
+            self.__add_broadpeak(broadpeaks,
+                                 chrom,
+                                 lvl2,
+                                 tmppeakset)
+            tmppeakset = []
+            # add the rest of the lvl2 peaks
+            for j in range(i+1, len(lvl2peakschrom)):
+                self.__add_broadpeak(broadpeaks,
+                                     chrom,
+                                     lvl2peakschrom[j],
+                                     tmppeakset)
+
+        return broadpeaks
+
+    @cython.cfunc
+    def __chrom_call_broadpeak_using_certain_criteria(self,
+                                                      lvl1peaks,
+                                                      lvl2peaks,
+                                                      chrom: bytes,
+                                                      scoring_function_s: list,
+                                                      lvl1_cutoff_s: list,
+                                                      lvl2_cutoff_s: list,
+                                                      min_length: cython.int,
+                                                      lvl1_max_gap: cython.int,
+                                                      lvl2_max_gap: cython.int,
+                                                      save_bedGraph: bool):
+        """Call peaks for a chromosome.
+
+        Combination of criteria is allowed here.
+
+        peaks: a PeakIO object
+
+        scoring_function_s: symbols of functions to calculate score
+        as score=f(x, y) where x is treatment pileup, and y is
+        control pileup
+
+        save_bedGraph: whether or not to save pileup and control into
+        a bedGraph file
+
+        """
+        i: cython.int
+        s: str
+        above_cutoff: cnp.ndarray
+        above_cutoff_endpos: cnp.ndarray
+        above_cutoff_startpos: cnp.ndarray
+        pos_array: cnp.ndarray
+        treat_array: cnp.ndarray
+        ctrl_array: cnp.ndarray
+        above_cutoff_index_array: cnp.ndarray
+        score_array_s: list  # a list to keep different types of scores
+        peak_content: list
+        acs_ptr: cython.pointer(cython.int)
+        ace_ptr: cython.pointer(cython.int)
+        acia_ptr: cython.pointer(cython.int)
+        treat_array_ptr: cython.pointer(cython.float)
+        ctrl_array_ptr: cython.pointer(cython.float)
+
+        assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!"
+        assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!"
+
+        # first, build pileup, self.chr_pos_treat_ctrl
+        self.pileup_treat_ctrl_a_chromosome(chrom)
+        [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl
+
+        # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome
+        if save_bedGraph:
+            self.__write_bedGraph_for_a_chromosome(chrom)
+
+        # keep all types of scores needed
+        score_array_s = []
+        for i in range(len(scoring_function_s)):
+            s = scoring_function_s[i]
+            if s == 'p':
+                score_array_s.append(self.__cal_pscore(treat_array,
+                                                       ctrl_array))
+            elif s == 'q':
+                score_array_s.append(self.__cal_qscore(treat_array,
+                                                       ctrl_array))
+            elif s == 'f':
+                score_array_s.append(self.__cal_FE(treat_array,
+                                                   ctrl_array))
+            elif s == 's':
+                score_array_s.append(self.__cal_subtraction(treat_array,
+                                                            ctrl_array))
+
+        # lvl1 : strong peaks
+        peak_content = []  # to store points above cutoff
+
+        # get the regions with scores above the cutoffs. This is not
+        # an optimized method; it would be better to store the score
+        # array in a 2-D ndarray.
+        above_cutoff = np.nonzero(apply_multiple_cutoffs(score_array_s,
+                                                         lvl1_cutoff_s))[0]
+ above_cutoff_index_array = np.arange(pos_array.shape[0], + dtype="int32")[above_cutoff] # indices + above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff + above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff + + if above_cutoff.size == 0: + # nothing above cutoff + return + + if above_cutoff[0] == 0: + # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] + above_cutoff_startpos[0] = 0 + + # first bit of region above cutoff + acs_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_startpos.data) + ace_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_endpos.data) + acia_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_index_array.data) + treat_array_ptr = cython.cast(cython.pointer(cython.float), + treat_array.data) + ctrl_array_ptr = cython.cast(cython.pointer(cython.float), + ctrl_array.data) + + ts = acs_ptr[0] + te = ace_ptr[0] + ti = acia_ptr[0] + tp = treat_array_ptr[ti] + cp = ctrl_array_ptr[ti] + + peak_content.append((ts, te, tp, cp, ti)) + acs_ptr += 1 # move ptr + ace_ptr += 1 + acia_ptr += 1 + lastp = te + + # peak_content.append((above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], score_array_s, above_cutoff_index_array[0])) + for i in range(1, above_cutoff_startpos.size): + ts = acs_ptr[0] + te = ace_ptr[0] + ti = acia_ptr[0] + acs_ptr += 1 + ace_ptr += 1 + acia_ptr += 1 + tp = treat_array_ptr[ti] + cp = ctrl_array_ptr[ti] + tl = ts - lastp + if tl <= lvl1_max_gap: + # append + peak_content.append((ts, te, tp, cp, ti)) + lastp = te + else: + # close + self.__close_peak_for_broad_region(peak_content, + lvl1peaks, + min_length, + chrom, + lvl1_max_gap//2, + score_array_s) + peak_content = [(ts, te, tp, cp, ti),] + lastp = te # above_cutoff_endpos[i] + + # save the last peak + if peak_content: + self.__close_peak_for_broad_region(peak_content, + lvl1peaks, + min_length, + chrom, + lvl1_max_gap//2, + score_array_s) + + # lvl2 : weak peaks + peak_content = [] # to store points above cutoff + + # get the regions with scores above cutoffs + + # this is not an optimized method. It would be better to store score array in a 2-D ndarray? + above_cutoff = np.nonzero(apply_multiple_cutoffs(score_array_s, + lvl2_cutoff_s))[0] + + above_cutoff_index_array = np.arange(pos_array.shape[0], + dtype="i4")[above_cutoff] # indices + above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff + above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff + + if above_cutoff.size == 0: + # nothing above cutoff + return + + if above_cutoff[0] == 0: + # first element > cutoff, fix the first point as 0. 
otherwise it would be the last item in data[chrom]['pos'] + above_cutoff_startpos[0] = 0 + + # first bit of region above cutoff + acs_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_startpos.data) + ace_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_endpos.data) + acia_ptr = cython.cast(cython.pointer(cython.int), + above_cutoff_index_array.data) + treat_array_ptr = cython.cast(cython.pointer(cython.float), + treat_array.data) + ctrl_array_ptr = cython.cast(cython.pointer(cython.float), + ctrl_array.data) + + ts = acs_ptr[0] + te = ace_ptr[0] + ti = acia_ptr[0] + tp = treat_array_ptr[ti] + cp = ctrl_array_ptr[ti] + peak_content.append((ts, te, tp, cp, ti)) + acs_ptr += 1 # move ptr + ace_ptr += 1 + acia_ptr += 1 + + lastp = te + for i in range(1, above_cutoff_startpos.size): + # for everything above cutoff + ts = acs_ptr[0] # get the start + te = ace_ptr[0] # get the end + ti = acia_ptr[0] # get the index + + acs_ptr += 1 # move ptr + ace_ptr += 1 + acia_ptr += 1 + tp = treat_array_ptr[ti] # get the treatment pileup + cp = ctrl_array_ptr[ti] # get the control pileup + tl = ts - lastp # get the distance from the current point to last position of existing peak_content + + if tl <= lvl2_max_gap: + # append + peak_content.append((ts, te, tp, cp, ti)) + lastp = te + else: + # close + self.__close_peak_for_broad_region(peak_content, + lvl2peaks, + min_length, + chrom, + lvl2_max_gap//2, + score_array_s) + + peak_content = [(ts, te, tp, cp, ti),] + lastp = te + + # save the last peak + if peak_content: + self.__close_peak_for_broad_region(peak_content, + lvl2peaks, + min_length, + chrom, + lvl2_max_gap//2, + score_array_s) + + return + + @cython.cfunc + def __close_peak_for_broad_region(self, + peak_content: list, + peaks, + min_length: cython.int, + chrom: bytes, + smoothlen: cython.int, + score_array_s: list, + score_cutoff_s: list = []) -> bool: + """Close the broad peak region, output peak boundaries, peak summit + and scores, then add the peak to peakIO object. 
+
+        peak_content contains [start, end, treat_p, ctrl_p,
+        index_in_score_array]
+
+        peaks: a PeakIO object to store the called regions (lvl1peaks
+        or lvl2peaks)
+
+        """
+        tstart: cython.int
+        tend: cython.int
+        i: cython.int
+        ttreat_p: cython.double
+        tctrl_p: cython.double
+        tlist_pileup: list
+        tlist_control: list
+        tlist_length: list
+        tlist_scores_p: cython.int
+        tarray_pileup: cnp.ndarray
+        tarray_control: cnp.ndarray
+        tarray_pscore: cnp.ndarray
+        tarray_qscore: cnp.ndarray
+        tarray_fc: cnp.ndarray
+
+        peak_length = peak_content[-1][1] - peak_content[0][0]
+        if peak_length >= min_length:  # if the peak is too small, reject it
+            tlist_pileup = []
+            tlist_control = []
+            tlist_length = []
+            for i in range(len(peak_content)):  # each position in broad peak
+                (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i]
+                tlist_pileup.append(ttreat_p)
+                tlist_control.append(tctrl_p)
+                tlist_length.append(tend - tstart)
+
+            tarray_pileup = np.array(tlist_pileup, dtype="f4")
+            tarray_control = np.array(tlist_control, dtype="f4")
+            tarray_pscore = self.__cal_pscore(tarray_pileup, tarray_control)
+            tarray_qscore = self.__cal_qscore(tarray_pileup, tarray_control)
+            tarray_fc = self.__cal_FE(tarray_pileup, tarray_control)
+
+            peaks.add(chrom,    # chromosome
+                      peak_content[0][0],   # start
+                      peak_content[-1][1],  # end
+                      summit=0,
+                      peak_score=mean_from_value_length(tarray_qscore, tlist_length),
+                      pileup=mean_from_value_length(tarray_pileup, tlist_length),
+                      pscore=mean_from_value_length(tarray_pscore, tlist_length),
+                      fold_change=mean_from_value_length(tarray_fc, tlist_length),
+                      qscore=mean_from_value_length(tarray_qscore, tlist_length),
+                      )
+            return True
+        # the region is shorter than min_length and is rejected
+        return False
+
+    @cython.cfunc
+    def __add_broadpeak(self,
+                        bpeaks,
+                        chrom: bytes,
+                        lvl2peak: object,
+                        lvl1peakset: list):
+        """Internal function to create a broad peak.
+
+        *Note* lvl1peakset/strong_regions might be empty
+        """
+
+        blockNum: cython.int
+        start: cython.int
+        end: cython.int
+        blockSizes: bytes
+        blockStarts: bytes
+        thickStart: bytes
+        thickEnd: bytes
+
+        start = lvl2peak["start"]
+        end = lvl2peak["end"]
+
+        if not lvl1peakset:
+            # complement by adding 1bp blocks at the start and end of
+            # this region; may change in the future if the gappedPeak
+            # format is improved.
+ bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=(b"%d" % start), + thickEnd=(b"%d" % end), + blockNum=2, + blockSizes=b"1,1", + blockStarts=(b"0,%d" % (end-start-1)), + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + + thickStart = b"%d" % (lvl1peakset[0]["start"]) + thickEnd = b"%d" % (lvl1peakset[-1]["end"]) + blockNum = len(lvl1peakset) + blockSizes = b",".join([b"%d" % y for y in [x["length"] for x in lvl1peakset]]) + blockStarts = b",".join([b"%d" % x for x in getitem_then_subtract(lvl1peakset, start)]) + + # add 1bp left and/or right block if necessary + if int(thickStart) != start: + # add 1bp left block + thickStart = b"%d" % start + blockNum += 1 + blockSizes = b"1,"+blockSizes + blockStarts = b"0,"+blockStarts + if int(thickEnd) != end: + # add 1bp right block + thickEnd = b"%d" % end + blockNum += 1 + blockSizes = blockSizes + b",1" + blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) + + bpeaks.add(chrom, start, end, + score=lvl2peak["score"], + thickStart=thickStart, + thickEnd=thickEnd, + blockNum=blockNum, + blockSizes=blockSizes, + blockStarts=blockStarts, + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks diff --git a/MACS3/Signal/CallPeakUnit.pyx b/MACS3/Signal/CallPeakUnit.pyx deleted file mode 100644 index c6ffb7b8..00000000 --- a/MACS3/Signal/CallPeakUnit.pyx +++ /dev/null @@ -1,1781 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# cython: linetrace=True -# Time-stamp: <2022-09-15 17:06:17 Tao Liu> - -"""Module for Calculate Scores. - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). 
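`__add_broadpeak` above encodes the strong lvl1 regions as UCSC gappedPeak (BED12-style) block fields, padding with 1bp blocks when no strong region touches the broad region's edges. A toy sketch of the field arithmetic, assuming a non-empty strong set (the empty case takes the separate branch above; `gapped_blocks` is a hypothetical helper, not part of the patch):

```python
def gapped_blocks(start, end, strong):
    # strong: sorted (start, end) lvl1 regions inside [start, end);
    # sizes/starts mirror blockSizes/blockStarts relative to `start`
    sizes = [e - s for (s, e) in strong]
    starts = [s - start for (s, e) in strong]
    if strong[0][0] != start:   # pad the left edge with a 1bp block
        sizes, starts = [1] + sizes, [0] + starts
    if strong[-1][1] != end:    # pad the right edge with a 1bp block
        sizes, starts = sizes + [1], starts + [end - start - 1]
    return len(sizes), sizes, starts

print(gapped_blocks(100, 200, [(120, 140), (160, 180)]))
# -> (4, [1, 20, 20, 1], [0, 20, 60, 99])
```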
-""" - -# ------------------------------------ -# python modules -# ------------------------------------ - -from collections import Counter -from copy import copy -from time import time as ttime -import _pickle as cPickle -from tempfile import mkstemp -import os - -import logging -import MACS3.Utilities.Logger - -logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -# ------------------------------------ -# Other modules -# ------------------------------------ -import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t -from cpython cimport bool -from cykhash import PyObjectMap, Float32to32Map - -# ------------------------------------ -# C lib -# ------------------------------------ -from libc.stdio cimport * -from libc.math cimport exp,log,log10, M_LN10, log1p, erf, sqrt, floor, ceil - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness -from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname -from MACS3.Signal.FixWidthTrack import FWTrack -from MACS3.Signal.PairedEndTrack import PETrackI -from MACS3.Signal.Prob import poisson_cdf -# -------------------------------------------- -# cached pscore function and LR_asym functions -# -------------------------------------------- -pscore_dict = PyObjectMap() -logLR_dict = PyObjectMap() - -cdef float32_t get_pscore ( tuple t ): - """t: tuple of ( lambda, observation ) - """ - cdef: - float32_t val - if t in pscore_dict: - return pscore_dict[ t ] - else: - # calculate and cache - val = -1.0 * poisson_cdf ( t[0], t[1], False, True ) - pscore_dict[ t ] = val - return val - -cdef float32_t get_logLR_asym ( tuple t ): - """Calculate log10 Likelihood between H1 ( enriched ) and H0 ( - chromatin bias ). Set minus sign for depletion. 
- """ - cdef: - float32_t val - float32_t x - float32_t y - if t in logLR_dict: - return logLR_dict[ t ] - else: - x = t[0] - y = t[1] - # calculate and cache - if x > y: - val = (x*(log10(x)-log10(y))+y-x) - elif x < y: - val = (x*(-log10(x)+log10(y))-y+x) - else: - val = 0 - logLR_dict[ t ] = val - return val - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "CallPeakUnit $Revision$" -__author__ = "Tao Liu " -__doc__ = "CallPeakUnit" - -LOG10_E = 0.43429448190325176 - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -cdef void clean_up_ndarray ( np.ndarray x ): - # clean numpy ndarray in two steps - cdef: - int64_t i - i = x.shape[0] // 2 - x.resize( 100000 if i > 100000 else i, refcheck=False) - x.resize( 0, refcheck=False) - return - -cdef inline float32_t chi2_k1_cdf ( float32_t x ): - return erf( sqrt(x/2) ) - -cdef inline float32_t log10_chi2_k1_cdf ( float32_t x ): - return log10( erf( sqrt(x/2) ) ) - -cdef inline float32_t chi2_k2_cdf ( float32_t x ): - return 1 - exp( -x/2 ) - -cdef inline float32_t log10_chi2_k2_cdf ( float32_t x ): - return log1p( - exp( -x/2 ) ) * LOG10_E - -cdef inline float32_t chi2_k4_cdf ( float32_t x ): - return 1 - exp( -x/2 ) * ( 1 + x/2 ) - -cdef inline float32_t log10_chi2_k4_CDF ( float32_t x ): - return log1p( - exp( -x/2 ) * ( 1 + x/2 ) ) * LOG10_E - -cdef inline np.ndarray apply_multiple_cutoffs ( list multiple_score_arrays, list multiple_cutoffs ): - cdef: - int32_t i - np.ndarray ret - - ret = multiple_score_arrays[0] > multiple_cutoffs[0] - - for i in range(1,len(multiple_score_arrays)): - ret += multiple_score_arrays[i] > multiple_cutoffs[i] - - return ret - -cdef inline list get_from_multiple_scores ( list multiple_score_arrays, int32_t index ): - cdef: - list ret = [] - int32_t i - - for i in range(len(multiple_score_arrays)): - ret.append(multiple_score_arrays[i][index]) - return ret - - -cdef inline float32_t get_logFE ( float32_t x, float32_t y ): - """ return 100* log10 fold enrichment with +1 pseudocount. - """ - return log10( x/y ) - -cdef inline float32_t get_subtraction ( float32_t x, float32_t y): - """ return subtraction. - """ - return x - y - -cdef inline list getitem_then_subtract ( list peakset, int32_t start ): - cdef: - list a - - a = [x["start"] for x in peakset] - for i in range(len(a)): - a[i] = a[i] - start - return a - -cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ): - """ - """ - return sum([data[x] for x in data if x <= pos and x >= pos - width]) - -cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ): - """ - """ - return sum([data[x] for x in data if x >= pos and x <= pos + width]) - -cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ): - return data.get(pos,0) - data.get(pos-window_size, 0) - -cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ): - return data.get(pos + window_size, 0) - data.get(pos, 0) - -cdef float32_t median_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ): - """ - """ - cdef: - list tmp - int32_t c, tmp_l - float32_t tmp_v, mid_l - - c = 0 - tmp = sorted(list(zip( value, length ))) - mid_l = sum( length )/2 - for (tmp_v, tmp_l) in tmp: - c += tmp_l - if c > mid_l: - return tmp_v - -cdef float32_t mean_from_value_length ( np.ndarray[np.float32_t, ndim=1] value, list length ): - """take list of values and list of corresponding lengths, calculate the mean. 
- An important function for bedGraph type of data. - """ - cdef: - int32_t i - int32_t tmp_l, l - float64_t tmp_v, sum_v, tmp_sum #try to solve precision issue - float32_t ret - - sum_v = 0 - l = 0 - - for i in range( len(length) ): - tmp_l = length[ i ] - tmp_v = value[ i ] - tmp_sum = tmp_v * tmp_l - sum_v = tmp_sum + sum_v - l += tmp_l - - ret = (sum_v/l) - - return ret - - -cdef tuple find_optimal_cutoff( list x, list y ): - """Return the best cutoff x and y. - - We assume that total peak length increase exponentially while - decreasing cutoff value. But while cutoff decreases to a point - that background noises are captured, total length increases much - faster. So we fit a linear model by taking the first 10 points, - then look for the largest cutoff that - - """ - cdef: - np.ndarray npx, npy, npA - float32_t optimal_x, optimal_y - int64_t l, i - float32_t m, c # slop and intercept - float32_t sst # sum of squared total - float32_t sse # sum of squared error - float32_t rsq # R-squared - - l = len(x) - assert l == len(y) - npx = np.array( x ) - npy = np.log10( np.array( y ) ) - npA = np.vstack( [npx, np.ones(len(npx))] ).T - - for i in range( 10, l ): - # at least the largest 10 points - m, c = np.linalg.lstsq( npA[:i], npy[:i], rcond=None )[ 0 ] - sst = sum( ( npy[:i] - np.mean( npy[:i] ) ) ** 2 ) - sse = sum( ( npy[:i] - m*npx[:i] - c ) ** 2 ) - rsq = 1 - sse/sst - #print i, x[i], y[i], m, c, rsq - return ( 1.0, 1.0 ) - - - -# ------------------------------------ -# Classes -# ------------------------------------ -cdef class CallerFromAlignments: - """A unit to calculate scores and call peaks from alignments -- - FWTrack or PETrack objects. - - It will compute for each chromosome separately in order to save - memory usage. - """ - cdef: - object treat # FWTrack or PETrackI object for ChIP - object ctrl # FWTrack or PETrackI object for Control - - int32_t d # extension size for ChIP - list ctrl_d_s # extension sizes for Control. Can be multiple values - float32_t treat_scaling_factor # scaling factor for ChIP - list ctrl_scaling_factor_s # scaling factor for Control, corresponding to each extension size. - float32_t lambda_bg # minimum local bias to fill missing values - list chromosomes # name of common chromosomes in ChIP and Control data - float64_t pseudocount # the pseudocount used to calcuate logLR, FE or logFE - bytes bedGraph_filename_prefix # prefix will be added to _pileup.bdg for treatment and _lambda.bdg for control - - int32_t end_shift # shift of cutting ends before extension - bool trackline # whether trackline should be saved in bedGraph - bool save_bedGraph # whether to save pileup and local bias in bedGraph files - bool save_SPMR # whether to save pileup normalized by sequencing depth in million reads - bool no_lambda_flag # whether ignore local bias, and to use global bias instead - bool PE_mode # whether it's in PE mode, will be detected during initiation - - # temporary data buffer - list chr_pos_treat_ctrl # temporary [position, treat_pileup, ctrl_pileup] for a given chromosome - bytes bedGraph_treat_filename - bytes bedGraph_control_filename - FILE * bedGraph_treat_f - FILE * bedGraph_ctrl_f - - # data needed to be pre-computed before peak calling - object pqtable # remember pvalue->qvalue convertion; saved in cykhash Float32to32Map - bool pvalue_all_done # whether the pvalue of whole genome is all calculated. If yes, it's OK to calculate q-value. 
- - dict pvalue_npeaks # record for each pvalue cutoff, how many peaks can be called - dict pvalue_length # record for each pvalue cutoff, the total length of called peaks - float32_t optimal_p_cutoff # automatically decide the p-value cutoff ( can be translated into qvalue cutoff ) based - # on p-value to total peak length analysis. - bytes cutoff_analysis_filename # file to save the pvalue-npeaks-totallength table - - dict pileup_data_files # Record the names of temporary files for storing pileup values of each chromosome - - - def __init__ (self, treat, ctrl, - int32_t d = 200, list ctrl_d_s = [200, 1000, 10000], - float32_t treat_scaling_factor = 1.0, list ctrl_scaling_factor_s = [1.0, 0.2, 0.02], - bool stderr_on = False, - float32_t pseudocount = 1, - int32_t end_shift = 0, - float32_t lambda_bg = 0, - bool save_bedGraph = False, - str bedGraph_filename_prefix = "PREFIX", - str bedGraph_treat_filename = "TREAT.bdg", - str bedGraph_control_filename = "CTRL.bdg", - str cutoff_analysis_filename = "TMP.txt", - bool save_SPMR = False ): - """Initialize. - - A calculator is unique to each comparison of treat and - control. Treat_depth and ctrl_depth should not be changed - during calculation. - - treat and ctrl are either FWTrack or PETrackI objects. - - treat_depth and ctrl_depth are effective depth in million: - sequencing depth in million after - duplicates being filtered. If - treatment is scaled down to - control sample size, then this - should be control sample size in - million. And vice versa. - - d, sregion, lregion: d is the fragment size, sregion is the - small region size, lregion is the large - region size - - pseudocount: a pseudocount used to calculate logLR, FE or - logFE. Please note this value will not be changed - with normalization method. So if you really want - to set pseudocount 1 per million reads, set it - after you normalize treat and control by million - reads by `change_normalizetion_method(ord('M'))`. 
- - """ - cdef: - set chr1, chr2 - int32_t i - char * tmp - bytes tmp_bytes - float32_t p - # decide PE mode - if isinstance(treat, FWTrack): - self.PE_mode = False - elif isinstance(treat, PETrackI): - self.PE_mode = True - else: - raise Exception("Should be FWTrack or PETrackI object!") - # decide if there is control - self.treat = treat - if ctrl: - self.ctrl = ctrl - else: # while there is no control - self.ctrl = treat - self.trackline = False - self.d = d # note, self.d doesn't make sense in PE mode - self.ctrl_d_s = ctrl_d_s# note, self.d doesn't make sense in PE mode - self.treat_scaling_factor = treat_scaling_factor - self.ctrl_scaling_factor_s= ctrl_scaling_factor_s - self.end_shift = end_shift - self.lambda_bg = lambda_bg - self.pqtable = Float32to32Map( for_int = False ) # Float32 -> Float32 map - self.save_bedGraph = save_bedGraph - self.save_SPMR = save_SPMR - self.bedGraph_filename_prefix = bedGraph_filename_prefix.encode() - self.bedGraph_treat_filename = bedGraph_treat_filename.encode() - self.bedGraph_control_filename = bedGraph_control_filename.encode() - if not self.ctrl_d_s or not self.ctrl_scaling_factor_s: - self.no_lambda_flag = True - else: - self.no_lambda_flag = False - self.pseudocount = pseudocount - # get the common chromosome names from both treatment and control - chr1 = set(self.treat.get_chr_names()) - chr2 = set(self.ctrl.get_chr_names()) - self.chromosomes = sorted(list(chr1.intersection(chr2))) - - self.pileup_data_files = {} - self.pvalue_length = {} - self.pvalue_npeaks = {} - for p in np.arange( 0.3, 10, 0.3 ): # step for optimal cutoff is 0.3 in -log10pvalue, we try from pvalue 1E-10 (-10logp=10) to 0.5 (-10logp=0.3) - self.pvalue_length[ p ] = 0 - self.pvalue_npeaks[ p ] = 0 - self.optimal_p_cutoff = 0 - self.cutoff_analysis_filename = cutoff_analysis_filename.encode() - - cpdef destroy ( self ): - """Remove temporary files for pileup values of each chromosome. - - Note: This function MUST be called if the class object won't - be used anymore. - - """ - cdef: - bytes f - - for f in self.pileup_data_files.values(): - if os.path.isfile( f ): - os.unlink( f ) - return - - cpdef set_pseudocount( self, float32_t pseudocount ): - self.pseudocount = pseudocount - - cpdef enable_trackline( self ): - """Turn on trackline with bedgraph output - """ - self.trackline = True - - cdef __pileup_treat_ctrl_a_chromosome ( self, bytes chrom ): - """After this function is called, self.chr_pos_treat_ctrl will - be reset and assigned to the pileup values of the given - chromosome. - - """ - cdef: - list treat_pv, ctrl_pv - int64_t i - float32_t t - object f - str temp_filename - - assert chrom in self.chromosomes, "chromosome %s is not valid." % chrom - - # check backup file of pileup values. If not exists, create - # it. Otherwise, load them instead of calculating new pileup - # values. 
- if chrom in self.pileup_data_files: - try: - f = open( self.pileup_data_files[ chrom ],"rb" ) - self.chr_pos_treat_ctrl = cPickle.load( f ) - f.close() - return - except: - temp_fd, temp_filename = mkstemp() - os.close(temp_fd) - self.pileup_data_files[ chrom ] = temp_filename - else: - temp_fd, temp_filename = mkstemp() - os.close(temp_fd) - self.pileup_data_files[ chrom ] = temp_filename.encode() - - # reset or clean existing self.chr_pos_treat_ctrl - if self.chr_pos_treat_ctrl: # not a beautiful way to clean - clean_up_ndarray( self.chr_pos_treat_ctrl[0] ) - clean_up_ndarray( self.chr_pos_treat_ctrl[1] ) - clean_up_ndarray( self.chr_pos_treat_ctrl[2] ) - - if self.PE_mode: - treat_pv = self.treat.pileup_a_chromosome ( chrom, [self.treat_scaling_factor,], baseline_value = 0.0 ) - else: - treat_pv = self.treat.pileup_a_chromosome( chrom, [self.d,], [self.treat_scaling_factor,], baseline_value = 0.0, - directional = True, - end_shift = self.end_shift ) - - if not self.no_lambda_flag: - if self.PE_mode: - # note, we pileup up PE control as SE control because - # we assume the bias only can be captured at the - # surrounding regions of cutting sites from control experiments. - ctrl_pv = self.ctrl.pileup_a_chromosome_c( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, baseline_value = self.lambda_bg ) - else: - ctrl_pv = self.ctrl.pileup_a_chromosome( chrom, self.ctrl_d_s, self.ctrl_scaling_factor_s, - baseline_value = self.lambda_bg, - directional = False ) - else: - ctrl_pv = [treat_pv[0][-1:], np.array([self.lambda_bg,], dtype="float32")] # set a global lambda - - self.chr_pos_treat_ctrl = self.__chrom_pair_treat_ctrl( treat_pv, ctrl_pv) - - # clean treat_pv and ctrl_pv - treat_pv = [] - ctrl_pv = [] - - # save data to temporary file - try: - f = open(self.pileup_data_files[ chrom ],"wb") - cPickle.dump( self.chr_pos_treat_ctrl, f , protocol=2 ) - f.close() - except: - # fail to write then remove the key in pileup_data_files - self.pileup_data_files.pop(chrom) - return - - cdef list __chrom_pair_treat_ctrl ( self, treat_pv, ctrl_pv ): - """*private* Pair treat and ctrl pileup for each region. - - treat_pv and ctrl_pv are [np.ndarray, np.ndarray]. - - return [p, t, c] list, each element is a numpy array. - """ - cdef: - list ret - int64_t pre_p, index_ret, it, ic, lt, lc - np.ndarray[np.int32_t, ndim=1] t_p, c_p, ret_p - np.ndarray[np.float32_t, ndim=1] t_v, c_v, ret_t, ret_c - - int32_t * t_p_ptr - int32_t * c_p_ptr - int32_t * ret_p_ptr - - float32_t * t_v_ptr - float32_t * c_v_ptr - float32_t * ret_t_ptr - float32_t * ret_c_ptr - - [ t_p, t_v ] = treat_pv - [ c_p, c_v ] = ctrl_pv - - lt = t_p.shape[0] - lc = c_p.shape[0] - - chrom_max_len = lt + lc - - ret_p = np.zeros( chrom_max_len, dtype="int32" ) # position - ret_t = np.zeros( chrom_max_len, dtype="float32" ) # value from treatment - ret_c = np.zeros( chrom_max_len, dtype="float32" ) # value from control - - t_p_ptr = t_p.data - t_v_ptr = t_v.data - c_p_ptr = c_p.data - c_v_ptr = c_v.data - ret_p_ptr = ret_p.data - ret_t_ptr = ret_t.data - ret_c_ptr = ret_c.data - - pre_p = 0 - index_ret = 0 - it = 0 - ic = 0 - - while it < lt and ic < lc: - if t_p_ptr[0] < c_p_ptr[0]: - # clip a region from pre_p to p1, then set pre_p as p1. 
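The two-pointer merge in `__chrom_pair_treat_ctrl`, which continues below, re-cuts the treatment and control step functions onto their shared breakpoints so every output region carries one treatment value and one control value. A list-based sketch of the same walk, minus the pointer arithmetic (`pair_step_tracks` is a hypothetical name):

```python
def pair_step_tracks(t_pos, t_val, c_pos, c_val):
    # each track is a step function: pos[i] is the end of a region
    # holding val[i]; positions are sorted, both tracks start at 0
    ret_p, ret_t, ret_c = [], [], []
    it = ic = 0
    while it < len(t_pos) and ic < len(c_pos):
        p = min(t_pos[it], c_pos[ic])   # next breakpoint in either track
        ret_p.append(p)
        ret_t.append(t_val[it])
        ret_c.append(c_val[ic])
        if t_pos[it] == p:              # advance whichever track(s) end here
            it += 1
        if c_pos[ic] == p:
            ic += 1
    return ret_p, ret_t, ret_c

print(pair_step_tracks([10, 30], [1.0, 2.0], [20, 30], [5.0, 7.0]))
# -> ([10, 20, 30], [1.0, 2.0, 2.0], [5.0, 5.0, 7.0])
```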
- ret_p_ptr[0] = t_p_ptr[0] - ret_t_ptr[0] = t_v_ptr[0] - ret_c_ptr[0] = c_v_ptr[0] - ret_p_ptr += 1 - ret_t_ptr += 1 - ret_c_ptr += 1 - pre_p = t_p_ptr[0] - index_ret += 1 - # call for the next p1 and v1 - it += 1 - t_p_ptr += 1 - t_v_ptr += 1 - elif t_p_ptr[0] > c_p_ptr[0]: - # clip a region from pre_p to p2, then set pre_p as p2. - ret_p_ptr[0] = c_p_ptr[0] - ret_t_ptr[0] = t_v_ptr[0] - ret_c_ptr[0] = c_v_ptr[0] - ret_p_ptr += 1 - ret_t_ptr += 1 - ret_c_ptr += 1 - pre_p = c_p_ptr[0] - index_ret += 1 - # call for the next p2 and v2 - ic += 1 - c_p_ptr += 1 - c_v_ptr += 1 - else: - # from pre_p to p1 or p2, then set pre_p as p1 or p2. - ret_p_ptr[0] = t_p_ptr[0] - ret_t_ptr[0] = t_v_ptr[0] - ret_c_ptr[0] = c_v_ptr[0] - ret_p_ptr += 1 - ret_t_ptr += 1 - ret_c_ptr += 1 - pre_p = t_p_ptr[0] - index_ret += 1 - # call for the next p1, v1, p2, v2. - it += 1 - ic += 1 - t_p_ptr += 1 - t_v_ptr += 1 - c_p_ptr += 1 - c_v_ptr += 1 - - ret_p.resize( index_ret, refcheck=False) - ret_t.resize( index_ret, refcheck=False) - ret_c.resize( index_ret, refcheck=False) - return [ret_p, ret_t, ret_c] - - cdef np.ndarray __cal_score ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2, cal_func ): - cdef: - int64_t i - np.ndarray[np.float32_t, ndim=1] s - assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - for i in range(array1.shape[0]): - s[i] = cal_func( array1[i], array2[i] ) - return s - - cdef void __cal_pvalue_qvalue_table ( self ): - """After this function is called, self.pqtable is built. All - chromosomes will be iterated. So it will take some time. - - """ - cdef: - bytes chrom - np.ndarray pos_array, treat_array, ctrl_array, score_array - dict pscore_stat - int64_t n, pre_p, length, pre_l, l, i, j - float32_t this_v, pre_v, v, q, pre_q - int64_t N, k, this_l - float32_t f - list unique_values - int32_t * pos_ptr - float32_t * treat_value_ptr - float32_t * ctrl_value_ptr - - debug ( "Start to calculate pvalue stat..." ) - - pscore_stat = {} #dict() - for i in range( len( self.chromosomes ) ): - chrom = self.chromosomes[ i ] - pre_p = 0 - - self.__pileup_treat_ctrl_a_chromosome( chrom ) - [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl - - pos_ptr = pos_array.data - treat_value_ptr = treat_array.data - ctrl_value_ptr = ctrl_array.data - - for j in range(pos_array.shape[0]): - this_v = get_pscore( ((treat_value_ptr[0]), ctrl_value_ptr[0] ) ) - this_l = pos_ptr[0] - pre_p - if this_v in pscore_stat: - pscore_stat[ this_v ] += this_l - else: - pscore_stat[ this_v ] = this_l - pre_p = pos_ptr[0] - pos_ptr += 1 - treat_value_ptr += 1 - ctrl_value_ptr += 1 - - N = sum(pscore_stat.values()) # total length - k = 1 # rank - f = -log10(N) - pre_v = -2147483647 - pre_l = 0 - pre_q = 2147483647 # save the previous q-value - - self.pqtable = Float32to32Map( for_int = False ) - unique_values = sorted(list(pscore_stat.keys()), reverse=True) - for i in range(len(unique_values)): - v = unique_values[i] - l = pscore_stat[v] - q = v + (log10(k) + f) - if q > pre_q: - q = pre_q - if q <= 0: - q = 0 - break - #q = max(0,min(pre_q,q)) # make q-score monotonic - self.pqtable[ v ] = q - pre_q = q - k += l - # bottom rank pscores all have qscores 0 - for j in range(i, len(unique_values) ): - v = unique_values[ j ] - self.pqtable[ v ] = 0 - return - - cdef void __pre_computes ( self, int32_t max_gap = 50, int32_t min_length = 200 ): - """After this function is called, self.pqtable and self.pvalue_length is built. 
All - chromosomes will be iterated. So it will take some time. - - """ - cdef: - bytes chrom - np.ndarray pos_array, treat_array, ctrl_array, score_array - dict pscore_stat - int64_t n, pre_p, this_p, length, j, pre_l, l, i - float32_t q, pre_q, this_t, this_c - float32_t this_v, pre_v, v, cutoff - int64_t N, k, this_l - float32_t f - list unique_values - float64_t t0, t1, t - - np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos - list peak_content - int64_t peak_length, total_l, total_p - - list tmplist - - int32_t * acs_ptr # above cutoff start position pointer - int32_t * ace_ptr # above cutoff end position pointer - int32_t * pos_array_ptr # position array pointer - float32_t * score_array_ptr # score array pointer - - debug ( "Start to calculate pvalue stat..." ) - - # tmplist contains a list of log pvalue cutoffs from 0.3 to 10 - tmplist = [round(x,5) for x in sorted( list(np.arange(0.3, 10.0, 0.3)), reverse = True )] - - pscore_stat = {} #dict() - #print (list(pscore_stat.keys())) - #print (list(self.pvalue_length.keys())) - #print (list(self.pvalue_npeaks.keys())) - for i in range( len( self.chromosomes ) ): - chrom = self.chromosomes[ i ] - self.__pileup_treat_ctrl_a_chromosome( chrom ) - [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl - - score_array = self.__cal_pscore( treat_array, ctrl_array ) - - for n in range( len( tmplist ) ): - cutoff = tmplist[ n ] - total_l = 0 # total length in potential peak - total_p = 0 - - # get the regions with scores above cutoffs - above_cutoff = np.nonzero( score_array > cutoff )[0]# this is not an optimized method. It would be better to store score array in a 2-D ndarray? - above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff - above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff - - if above_cutoff_endpos.size == 0: - continue - - # first bit of region above cutoff - acs_ptr = above_cutoff_startpos.data - ace_ptr = above_cutoff_endpos.data - - peak_content = [( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ] - lastp = ace_ptr[ 0 ] - acs_ptr += 1 - ace_ptr += 1 - - for i in range( 1, above_cutoff_startpos.size ): - tl = acs_ptr[ 0 ] - lastp - if tl <= max_gap: - peak_content.append( ( acs_ptr[ 0 ], ace_ptr[ 0 ] ) ) - else: - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - total_l += peak_length - total_p += 1 - peak_content = [ ( acs_ptr[ 0 ], ace_ptr[ 0 ] ), ] - lastp = ace_ptr[ 0 ] - acs_ptr += 1 - ace_ptr += 1 - - if peak_content: - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - total_l += peak_length - total_p += 1 - self.pvalue_length[ cutoff ] = self.pvalue_length.get( cutoff, 0 ) + total_l - self.pvalue_npeaks[ cutoff ] = self.pvalue_npeaks.get( cutoff, 0 ) + total_p - - pos_array_ptr = pos_array.data - score_array_ptr = score_array.data - - pre_p = 0 - for i in range(pos_array.shape[0]): - this_p = pos_array_ptr[ 0 ] - this_l = this_p - pre_p - this_v = score_array_ptr[ 0 ] - if this_v in pscore_stat: - pscore_stat[ this_v ] += this_l - else: - pscore_stat[ this_v ] = this_l - pre_p = this_p #pos_array[ i ] - pos_array_ptr += 1 - score_array_ptr += 1 - - #debug ( "make pscore_stat cost %.5f seconds" % t ) - - # add all pvalue cutoffs from cutoff-analysis part. So that we - # can get the corresponding qvalues for them. 
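The ranking pass below turns p-scores into q-scores: with `pscore_stat` mapping each distinct -log10 p-value to the number of base pairs carrying it, a value of rank k among N total bases gets q = p + log10(k) - log10(N), clamped so q-scores never increase as p-scores decrease. A dictionary-based sketch with toy numbers (`pscore_to_qscore` is a hypothetical name, not part of the patch):

```python
from math import log10

def pscore_to_qscore(pscore_stat):
    # pscore_stat: {-log10 pvalue: total bp with that score}
    N = sum(pscore_stat.values())
    f = -log10(N)
    pqtable, k, pre_q = {}, 1, float("inf")
    for v in sorted(pscore_stat, reverse=True):     # most significant first
        q = max(0.0, min(pre_q, v + log10(k) + f))  # clamp: keep q monotonic
        pqtable[v] = q
        pre_q = q
        k += pscore_stat[v]   # rank grows by the bp carrying this score
    return pqtable

print(pscore_to_qscore({6.0: 10, 4.0: 90, 1.0: 900}))
# -> {6.0: 3.0, 4.0: ~2.04, 1.0: ~0.004}
```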
- for cutoff in tmplist: - if cutoff not in pscore_stat: - pscore_stat[ cutoff ] = 0 - - nhval = 0 - - N = sum(pscore_stat.values()) # total length - k = 1 # rank - f = -log10(N) - pre_v = -2147483647 - pre_l = 0 - pre_q = 2147483647 # save the previous q-value - - self.pqtable = Float32to32Map( for_int = False ) #{} - unique_values = sorted(list(pscore_stat.keys()), reverse=True) #sorted(unique_values,reverse=True) - for i in range(len(unique_values)): - v = unique_values[i] - l = pscore_stat[v] - q = v + (log10(k) + f) - if q > pre_q: - q = pre_q - if q <= 0: - q = 0 - break - #q = max(0,min(pre_q,q)) # make q-score monotonic - self.pqtable[ v ] = q - pre_v = v - pre_q = q - k+=l - for j in range(i, len(unique_values) ): - v = unique_values[ j ] - self.pqtable[ v ] = 0 - - # write pvalue and total length of predicted peaks - # this is the output from cutoff-analysis - fhd = open( self.cutoff_analysis_filename, "w" ) - fhd.write( "pscore\tqscore\tnpeaks\tlpeaks\tavelpeak\n" ) - x = [] - y = [] - for cutoff in tmplist: - if self.pvalue_npeaks[ cutoff ] > 0: - fhd.write( "%.2f\t%.2f\t%d\t%d\t%.2f\n" % ( cutoff, self.pqtable[ cutoff ], self.pvalue_npeaks[ cutoff ], self.pvalue_length[ cutoff ], self.pvalue_length[ cutoff ]/self.pvalue_npeaks[ cutoff ] ) ) - x.append( cutoff ) - y.append( self.pvalue_length[ cutoff ] ) - fhd.close() - info( "#3 Analysis of cutoff vs num of peaks or total length has been saved in %s" % self.cutoff_analysis_filename ) - #info( "#3 Suggest a cutoff..." ) - #optimal_cutoff, optimal_length = find_optimal_cutoff( x, y ) - #info( "#3 -10log10pvalue cutoff %.2f will call approximately %.0f bps regions as significant regions" % ( optimal_cutoff, optimal_length ) ) - #print (list(pqtable.keys())) - #print (list(self.pvalue_length.keys())) - #print (list(self.pvalue_npeaks.keys())) - return - - cpdef call_peaks ( self, list scoring_function_symbols, list score_cutoff_s, int32_t min_length = 200, - int32_t max_gap = 50, bool call_summits = False, bool cutoff_analysis = False ): - """Call peaks for all chromosomes. Return a PeakIO object. - - scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q'] - score_cutoff_s : cutoff values corresponding to scoring functions - min_length : minimum length of peak - max_gap : maximum gap of 'insignificant' regions within a peak. Note, for PE_mode, max_gap and max_length are both set as fragment length. - call_summits : boolean. Whether or not call sub-peaks. - save_bedGraph : whether or not to save pileup and control into a bedGraph file - """ - cdef: - bytes chrom - bytes tmp_bytes - - peaks = PeakIO() - - # prepare p-q table - if len( self.pqtable ) == 0: - info("#3 Pre-compute pvalue-qvalue table...") - if cutoff_analysis: - info("#3 Cutoff vs peaks called will be analyzed!") - self.__pre_computes( max_gap = max_gap, min_length = min_length ) - else: - self.__cal_pvalue_qvalue_table() - - - # prepare bedGraph file - if self.save_bedGraph: - self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" ) - self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" ) - - info ("#3 In the peak calling step, the following will be performed simultaneously:") - info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg") - info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... 
%s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg") - - if self.save_SPMR: - info ( "#3 --SPMR is requested, so pileup will be normalized by sequencing depth in million reads." ) - elif self.treat_scaling_factor == 1: - info ( "#3 Pileup will be based on sequencing depth in treatment." ) - else: - info ( "#3 Pileup will be based on sequencing depth in control." ) - - if self.trackline: - # this line is REQUIRED by the wiggle format for UCSC browser - tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode() - fprintf( self.bedGraph_treat_f, tmp_bytes ) - tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode() - fprintf( self.bedGraph_ctrl_f, tmp_bytes ) - - info("#3 Call peaks for each chromosome...") - for chrom in self.chromosomes: - # treat/control bedGraph will be saved if requested by user. - self.__chrom_call_peak_using_certain_criteria ( peaks, chrom, scoring_function_symbols, score_cutoff_s, min_length, max_gap, call_summits, self.save_bedGraph ) - - # close bedGraph file - if self.save_bedGraph: - fclose(self.bedGraph_treat_f) - fclose(self.bedGraph_ctrl_f) - self.save_bedGraph = False - - return peaks - - cdef void __chrom_call_peak_using_certain_criteria ( self, peaks, bytes chrom, list scoring_function_s, list score_cutoff_s, int32_t min_length, - int32_t max_gap, bool call_summits, bool save_bedGraph ): - """ Call peaks for a chromosome. - - Combination of criteria is allowed here. - - peaks: a PeakIO object, the return value of this function - scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup - save_bedGraph : whether or not to save pileup and control into a bedGraph file - """ - cdef: - float64_t t0 - int32_t i, n - str s - np.ndarray above_cutoff - np.ndarray[np.int32_t, ndim=1] above_cutoff_endpos, above_cutoff_startpos, pos_array, above_cutoff_index_array - - np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array - list score_array_s # list to keep different types of scores - list peak_content # to store information for a - # chunk in a peak region, it - # contains lists of: 1. left - # position; 2. right - # position; 3. treatment - # value; 4. control value; - # 5. list of scores at this - # chunk - int64_t tl, lastp, ts, te, ti - float32_t tp, cp - int32_t * acs_ptr - int32_t * ace_ptr - int32_t * acia_ptr - float32_t * treat_array_ptr - float32_t * ctrl_array_ptr - - - assert len(scoring_function_s) == len(score_cutoff_s), "number of functions and cutoffs should be the same!" - - peak_content = [] # to store points above cutoff - - # first, build pileup, self.chr_pos_treat_ctrl - # this step will be speeped up if pqtable is pre-computed. 
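Both the narrow-peak caller below and the broad-peak caller kept above build candidates the same way: consecutive above-cutoff segments are merged while the gap to the previous segment is at most max_gap, and a merged candidate shorter than min_length is rejected. A standalone sketch of that walk (`merge_above_cutoff` is a hypothetical name):

```python
def merge_above_cutoff(segments, max_gap, min_length):
    # segments: sorted (start, end) runs whose scores pass the cutoff
    peaks, current = [], [segments[0]]
    for (s, e) in segments[1:]:
        if s - current[-1][1] <= max_gap:   # small gap: extend the candidate
            current.append((s, e))
        else:                               # large gap: close the candidate
            if current[-1][1] - current[0][0] >= min_length:
                peaks.append((current[0][0], current[-1][1]))
            current = [(s, e)]
    if current[-1][1] - current[0][0] >= min_length:  # close the last one
        peaks.append((current[0][0], current[-1][1]))
    return peaks

print(merge_above_cutoff([(0, 80), (100, 190), (400, 460)], 50, 150))
# -> [(0, 190)]: the first two runs merge (gap 20 <= 50); the last run
#    is isolated and, at 60 bp, shorter than min_length
```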
- self.__pileup_treat_ctrl_a_chromosome( chrom ) - [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl - - # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome - if save_bedGraph: - self.__write_bedGraph_for_a_chromosome ( chrom ) - - # keep all types of scores needed - #t0 = ttime() - score_array_s = [] - for i in range(len(scoring_function_s)): - s = scoring_function_s[i] - if s == 'p': - score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) ) - elif s == 'q': - score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) ) - elif s == 'f': - score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) ) - elif s == 's': - score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) ) - - # get the regions with scores above cutoffs - above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,score_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? - above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices - above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff - above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff - - if above_cutoff.size == 0: - # nothing above cutoff - return - - if above_cutoff[0] == 0: - # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] - above_cutoff_startpos[0] = 0 - - #print "apply cutoff -- chrom:",chrom," time:", ttime() - t0 - # start to build peak regions - #t0 = ttime() - - # first bit of region above cutoff - acs_ptr = above_cutoff_startpos.data - ace_ptr = above_cutoff_endpos.data - acia_ptr= above_cutoff_index_array.data - treat_array_ptr = treat_array.data - ctrl_array_ptr = ctrl_array.data - - ts = acs_ptr[ 0 ] - te = ace_ptr[ 0 ] - ti = acia_ptr[ 0 ] - tp = treat_array_ptr[ ti ] - cp = ctrl_array_ptr[ ti ] - - peak_content.append( ( ts, te, tp, cp, ti ) ) - lastp = te - acs_ptr += 1 - ace_ptr += 1 - acia_ptr+= 1 - - for i in range( 1, above_cutoff_startpos.shape[0] ): - ts = acs_ptr[ 0 ] - te = ace_ptr[ 0 ] - ti = acia_ptr[ 0 ] - acs_ptr += 1 - ace_ptr += 1 - acia_ptr+= 1 - tp = treat_array_ptr[ ti ] - cp = ctrl_array_ptr[ ti ] - tl = ts - lastp - if tl <= max_gap: - # append. - peak_content.append( ( ts, te, tp, cp, ti ) ) - lastp = te #above_cutoff_endpos[i] - else: - # close - if call_summits: - self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' - else: - self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' - peak_content = [ ( ts, te, tp, cp, ti ), ] - lastp = te #above_cutoff_endpos[i] - # save the last peak - if not peak_content: - return - else: - if call_summits: - self.__close_peak_with_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. fragment size 'd' - else: - self.__close_peak_wo_subpeaks (peak_content, peaks, min_length, chrom, min_length, score_array_s, score_cutoff_s = score_cutoff_s ) # smooth length is min_length, i.e. 
fragment size 'd' - - #print "close peaks -- chrom:",chrom," time:", ttime() - t0 - return - - cdef bool __close_peak_wo_subpeaks (self, list peak_content, peaks, int32_t min_length, - bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]): - """Close the peak region, output peak boundaries, peak summit - and scores, then add the peak to peakIO object. - - peak_content contains [start, end, treat_p, ctrl_p, index_in_score_array] - - peaks: a PeakIO object - - """ - cdef: - int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex - float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score - int32_t tlist_scores_p - - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - tsummit = [] - summit_pos = 0 - summit_value = 0 - for i in range(len(peak_content)): - (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] - tscore = ttreat_p # use pscore as general score to find summit - if not summit_value or summit_value < tscore: - tsummit = [(tend + tstart) // 2, ] - tsummit_index = [ i, ] - summit_value = tscore - elif summit_value == tscore: - # remember continuous summit values - tsummit.append((tend + tstart) // 2) - tsummit_index.append( i ) - # the middle of all highest points in peak region is defined as summit - midindex = (len(tsummit) + 1) // 2 - 1 - summit_pos = tsummit[ midindex ] - summit_index = tsummit_index[ midindex ] - - summit_treat = peak_content[ summit_index ][ 2 ] - summit_ctrl = peak_content[ summit_index ][ 3 ] - - # this is a double-check to see if the summit can pass cutoff values. - for i in range(len(score_cutoff_s)): - if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]: - return False # not passed, then disgard this peak. - - summit_p_score = pscore_dict[ ( (summit_treat), summit_ctrl ) ] #get_pscore(( (summit_treat), summit_ctrl ) ) - summit_q_score = self.pqtable[ summit_p_score ] - - peaks.add( chrom, # chromosome - peak_content[0][0], # start - peak_content[-1][1], # end - summit = summit_pos, # summit position - peak_score = summit_q_score, # score at summit - pileup = summit_treat, # pileup - pscore = summit_p_score, # pvalue - fold_change = ( summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change - qscore = summit_q_score # qvalue - ) - # start a new peak - return True - - cdef bool __close_peak_with_subpeaks (self, list peak_content, peaks, int32_t min_length, - bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[], - float32_t min_valley = 0.9 ): - """Algorithm implemented by Ben, to profile the pileup signals - within a peak region then find subpeak summits. This method is - highly recommended for TFBS or DNAase I sites. 
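In `__close_peak_wo_subpeaks` above, the reported summit is the middle of the plateau of positions tied for the highest treatment pileup. A compact sketch of that choice (`plateau_middle_summit` is a hypothetical name, not part of the patch):

```python
def plateau_middle_summit(chunks):
    # chunks: (start, end, treat_pileup) pieces of one peak; take the
    # midpoints of all chunks tied at the maximum pileup and return
    # the middle one, as the (len + 1)//2 - 1 index does above
    best = max(tp for (_, _, tp) in chunks)
    tsummit = [(s + e) // 2 for (s, e, tp) in chunks if tp == best]
    return tsummit[(len(tsummit) + 1) // 2 - 1]

print(plateau_middle_summit([(0, 10, 7.0), (10, 20, 7.0), (20, 30, 3.0)]))
# -> 5: midpoints of the tied chunks are [5, 15]; the middle index picks 5
```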
- - """ - cdef: - int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset - int32_t start, end, i, j, start_boundary, m, n, l - float64_t summit_value, tvalue, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score - np.ndarray[np.float32_t, ndim=1] peakdata - np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets - int32_t tlist_scores_p - - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - - if peak_length < min_length: return # if the region is too small, reject it - - # Add 10 bp padding to peak region so that we can get true minima - end = peak_content[ -1 ][ 1 ] + 10 - start = peak_content[ 0 ][ 0 ] - 10 - if start < 0: - start_boundary = 10 + start # this is the offset of original peak boundary in peakdata list. - start = 0 - else: - start_boundary = 10 # this is the offset of original peak boundary in peakdata list. - - peakdata = np.zeros(end - start, dtype='float32') # save the scores (qscore) for each position in this region - peakindices = np.zeros(end - start, dtype='int32') # save the indices for each position in this region - for i in range(len(peak_content)): - (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] - tscore = ttreat_p # use pileup as general score to find summit - m = tstart - start + start_boundary - n = tend - start + start_boundary - peakdata[m:n] = tscore - peakindices[m:n] = i - - summit_offsets = maxima(peakdata, smoothlen) # offsets are the indices for summits in peakdata/peakindices array. - - if summit_offsets.shape[0] == 0: - # **failsafe** if no summits, fall back on old approach # - return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s) - else: - # remove maxima that occurred in padding - m = np.searchsorted(summit_offsets, start_boundary) - n = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right') - summit_offsets = summit_offsets[m:n] - - summit_offsets = enforce_peakyness(peakdata, summit_offsets) - - #print "enforced:",summit_offsets - if summit_offsets.shape[0] == 0: - # **failsafe** if no summits, fall back on old approach # - return self.__close_peak_wo_subpeaks(peak_content, peaks, min_length, chrom, smoothlen, score_array_s, score_cutoff_s) - - summit_indices = peakindices[summit_offsets] # indices are those point to peak_content - summit_offsets -= start_boundary - - for summit_offset, summit_index in list(zip(summit_offsets, summit_indices)): - - summit_treat = peak_content[ summit_index ][ 2 ] - summit_ctrl = peak_content[ summit_index ][ 3 ] - - summit_p_score = pscore_dict[ ( (summit_treat), summit_ctrl ) ] # get_pscore(( (summit_treat), summit_ctrl ) ) - summit_q_score = self.pqtable[ summit_p_score ] - - for i in range(len(score_cutoff_s)): - if score_cutoff_s[i] > score_array_s[ i ][ peak_content[ summit_index ][ 4 ] ]: - return False # not passed, then disgard this summit. 
- - peaks.add( chrom, - peak_content[ 0 ][ 0 ], - peak_content[ -1 ][ 1 ], - summit = start + summit_offset, - peak_score = summit_q_score, - pileup = summit_treat, - pscore = summit_p_score, - fold_change = (summit_treat + self.pseudocount ) / ( summit_ctrl + self.pseudocount ), # fold change - qscore = summit_q_score - ) - # start a new peak - return True - - cdef np.ndarray __cal_pscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ): - cdef: - int64_t i, array1_size - np.ndarray[np.float32_t, ndim=1] s - float32_t * a1_ptr - float32_t * a2_ptr - float32_t * s_ptr - - assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - - a1_ptr = array1.data - a2_ptr = array2.data - s_ptr = s.data - - array1_size = array1.shape[0] - - for i in range(array1_size): - s_ptr[0] = get_pscore(( (a1_ptr[0]), a2_ptr[0] )) - s_ptr += 1 - a1_ptr += 1 - a2_ptr += 1 - return s - - cdef np.ndarray __cal_qscore ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ): - cdef: - int64_t i, array1_size - np.ndarray[np.float32_t, ndim=1] s - float32_t * a1_ptr - float32_t * a2_ptr - float32_t * s_ptr - - assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - - a1_ptr = array1.data - a2_ptr = array2.data - s_ptr = s.data - - for i in range(array1.shape[0]): - s_ptr[0] = self.pqtable[ get_pscore(( (a1_ptr[0]), a2_ptr[0] )) ] - s_ptr += 1 - a1_ptr += 1 - a2_ptr += 1 - return s - - cdef np.ndarray __cal_logLR ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ): - cdef: - int64_t i, array1_size - np.ndarray[np.float32_t, ndim=1] s - float32_t * a1_ptr - float32_t * a2_ptr - float32_t * s_ptr - - assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - - a1_ptr = array1.data - a2_ptr = array2.data - s_ptr = s.data - - for i in range(array1.shape[0]): - s_ptr[0] = get_logLR_asym( (a1_ptr[0] + self.pseudocount, a2_ptr[0] + self.pseudocount ) ) - s_ptr += 1 - a1_ptr += 1 - a2_ptr += 1 - return s - - cdef np.ndarray __cal_logFE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ): - cdef: - int64_t i, array1_size - np.ndarray[np.float32_t, ndim=1] s - float32_t * a1_ptr - float32_t * a2_ptr - float32_t * s_ptr - - assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - - a1_ptr = array1.data - a2_ptr = array2.data - s_ptr = s.data - - for i in range(array1.shape[0]): - s_ptr[0] = get_logFE( a1_ptr[0] + self.pseudocount, a2_ptr[0] + self.pseudocount ) - s_ptr += 1 - a1_ptr += 1 - a2_ptr += 1 - return s - - cdef np.ndarray __cal_FE ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ): - cdef: - int64_t i, array1_size - np.ndarray[np.float32_t, ndim=1] s - float32_t * a1_ptr - float32_t * a2_ptr - float32_t * s_ptr - - assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - - a1_ptr = array1.data - a2_ptr = array2.data - s_ptr = s.data - - for i in range(array1.shape[0]): - s_ptr[0] = (a1_ptr[0] + self.pseudocount) / ( a2_ptr[0] + self.pseudocount ) - s_ptr += 1 - a1_ptr += 1 - a2_ptr += 1 - return s - - cdef np.ndarray __cal_subtraction ( self, np.ndarray[np.float32_t, ndim=1] array1, np.ndarray[np.float32_t, ndim=1] array2 ): - cdef: - int64_t i, array1_size - np.ndarray[np.float32_t, ndim=1] s - float32_t * a1_ptr - float32_t * a2_ptr - float32_t * s_ptr - 
- assert array1.shape[0] == array2.shape[0] - s = np.zeros(array1.shape[0], dtype="float32") - - a1_ptr = array1.data - a2_ptr = array2.data - s_ptr = s.data - - for i in range(array1.shape[0]): - s_ptr[0] = a1_ptr[0] - a2_ptr[0] - s_ptr += 1 - a1_ptr += 1 - a2_ptr += 1 - return s - - - cdef bool __write_bedGraph_for_a_chromosome ( self, bytes chrom ): - """Write treat/control values for a certain chromosome into a - specified file handler. - - """ - cdef: - np.ndarray[np.int32_t, ndim=1] pos_array - np.ndarray[np.float32_t, ndim=1] treat_array, ctrl_array - int32_t * pos_array_ptr - float32_t * treat_array_ptr - float32_t * ctrl_array_ptr - int32_t l, i - int32_t p, pre_p_t, pre_p_c # current position, previous position for treat, previous position for control - float32_t pre_v_t, pre_v_c, v_t, v_c # previous value for treat, for control, current value for treat, for control - float32_t denominator # 1 if save_SPMR is false, or depth in million if save_SPMR is true. Note, while piling up and calling peaks, treatment and control have been scaled to the same depth, so we need to find what this 'depth' is. - FILE * ft - FILE * fc - basestring tmp_bytes - - [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl - pos_array_ptr = pos_array.data - treat_array_ptr = treat_array.data - ctrl_array_ptr = ctrl_array.data - - if self.save_SPMR: - if self.treat_scaling_factor == 1: - # in this case, control has been asked to be scaled to depth of treatment - denominator = self.treat.total/1e6 - else: - # in this case, treatment has been asked to be scaled to depth of control - denominator = self.ctrl.total/1e6 - else: - denominator = 1.0 - - l = pos_array.shape[ 0 ] - - if l == 0: # if there is no data, return - return False - - ft = self.bedGraph_treat_f - fc = self.bedGraph_ctrl_f - #t_write_func = self.bedGraph_treat.write - #c_write_func = self.bedGraph_ctrl.write - - pre_p_t = 0 - pre_p_c = 0 - pre_v_t = treat_array_ptr[ 0 ]/denominator - pre_v_c = ctrl_array_ptr [ 0 ]/denominator - treat_array_ptr += 1 - ctrl_array_ptr += 1 - - for i in range( 1, l ): - v_t = treat_array_ptr[ 0 ]/denominator - v_c = ctrl_array_ptr [ 0 ]/denominator - p = pos_array_ptr [ 0 ] - pos_array_ptr += 1 - treat_array_ptr += 1 - ctrl_array_ptr += 1 - - if abs(pre_v_t - v_t) > 1e-5: # precision is 5 digits - fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t ) - pre_v_t = v_t - pre_p_t = p - - if abs(pre_v_c - v_c) > 1e-5: # precision is 5 digits - fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c ) - pre_v_c = v_c - pre_p_c = p - - p = pos_array_ptr[ 0 ] - # last one - fprintf( ft, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_t, p, pre_v_t ) - fprintf( fc, b"%s\t%d\t%d\t%.5f\n", chrom, pre_p_c, p, pre_v_c ) - - return True - - cpdef call_broadpeaks (self, list scoring_function_symbols, list lvl1_cutoff_s, list lvl2_cutoff_s, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400, bool cutoff_analysis = False): - """This function try to find enriched regions within which, - scores are continuously higher than a given cutoff for level - 1, and link them using the gap above level 2 cutoff with a - maximum length of lvl2_max_gap. - - scoring_function_s: symbols of functions to calculate score. 'p' for pscore, 'q' for qscore, 'f' for fold change, 's' for subtraction. for example: ['p', 'q'] - - lvl1_cutoff_s: list of cutoffs at highly enriched regions, corresponding to scoring functions. - lvl2_cutoff_s: list of cutoffs at less enriched regions, corresponding to scoring functions. 
- min_length : minimum peak length, default 200. - lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. - lvl2_max_gap : maximum length of linkage regions, default 400. - - Return both general PeakIO object for highly enriched regions - and gapped broad regions in BroadPeakIO. - """ - cdef: - int32_t i, j - bytes chrom - object lvl1peaks, lvl1peakschrom, lvl1 - object lvl2peaks, lvl2peakschrom, lvl2 - object broadpeaks - set chrs - list tmppeakset - - lvl1peaks = PeakIO() - lvl2peaks = PeakIO() - - # prepare p-q table - if len( self.pqtable ) == 0: - info("#3 Pre-compute pvalue-qvalue table...") - if cutoff_analysis: - info("#3 Cutoff value vs broad region calls will be analyzed!") - self.__pre_computes( max_gap = lvl2_max_gap, min_length = min_length ) - else: - self.__cal_pvalue_qvalue_table() - - # prepare bedGraph file - if self.save_bedGraph: - - self.bedGraph_treat_f = fopen( self.bedGraph_treat_filename, "w" ) - self.bedGraph_ctrl_f = fopen( self.bedGraph_control_filename, "w" ) - info ("#3 In the peak calling step, the following will be performed simultaneously:") - info ("#3 Write bedGraph files for treatment pileup (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_treat_pileup.bdg") - info ("#3 Write bedGraph files for control lambda (after scaling if necessary)... %s" % self.bedGraph_filename_prefix.decode() + "_control_lambda.bdg") - - if self.trackline: - # this line is REQUIRED by the wiggle format for UCSC browser - tmp_bytes = ("track type=bedGraph name=\"treatment pileup\" description=\"treatment pileup after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode() - fprintf( self.bedGraph_treat_f, tmp_bytes ) - tmp_bytes = ("track type=bedGraph name=\"control lambda\" description=\"control lambda after possible scaling for \'%s\'\"\n" % self.bedGraph_filename_prefix).encode() - fprintf( self.bedGraph_ctrl_f, tmp_bytes ) - - - info("#3 Call peaks for each chromosome...") - for chrom in self.chromosomes: - self.__chrom_call_broadpeak_using_certain_criteria ( lvl1peaks, lvl2peaks, chrom, scoring_function_symbols, lvl1_cutoff_s, lvl2_cutoff_s, min_length, lvl1_max_gap, lvl2_max_gap, self.save_bedGraph ) - - # close bedGraph file - if self.save_bedGraph: - fclose( self.bedGraph_treat_f ) - fclose( self.bedGraph_ctrl_f ) - #self.bedGraph_ctrl.close() - self.save_bedGraph = False - - # now combine lvl1 and lvl2 peaks - chrs = lvl1peaks.get_chr_names() - broadpeaks = BroadPeakIO() - # use lvl2_peaks as linking regions between lvl1_peaks - for chrom in sorted(chrs): - lvl1peakschrom = lvl1peaks.get_data_from_chrom(chrom) - lvl2peakschrom = lvl2peaks.get_data_from_chrom(chrom) - lvl1peakschrom_next = iter(lvl1peakschrom).__next__ - tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region - # our assumption is lvl1 regions should be included in lvl2 regions - try: - lvl1 = lvl1peakschrom_next() - for i in range( len(lvl2peakschrom) ): - # for each lvl2 peak, find all lvl1 peaks inside - # I assume lvl1 peaks can be ALL covered by lvl2 peaks. 
- lvl2 = lvl2peakschrom[i] - - while True: - if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: - tmppeakset.append(lvl1) - lvl1 = lvl1peakschrom_next() - else: - # make a hierarchical broad peak - #print lvl2["start"], lvl2["end"], lvl2["score"] - self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) - tmppeakset = [] - break - except StopIteration: - # no more strong (aka lvl1) peaks left - self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) - tmppeakset = [] - # add the rest lvl2 peaks - for j in range( i+1, len(lvl2peakschrom) ): - self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset ) - - return broadpeaks - - cdef void __chrom_call_broadpeak_using_certain_criteria ( self, lvl1peaks, lvl2peaks, bytes chrom, list scoring_function_s, list lvl1_cutoff_s, list lvl2_cutoff_s, - int32_t min_length, int32_t lvl1_max_gap, int32_t lvl2_max_gap, bool save_bedGraph): - """ Call peaks for a chromosome. - - Combination of criteria is allowed here. - - peaks: a PeakIO object - scoring_function_s: symbols of functions to calculate score as score=f(x, y) where x is treatment pileup, and y is control pileup - save_bedGraph : whether or not to save pileup and control into a bedGraph file - """ - cdef: - int32_t i - str s - np.ndarray above_cutoff, above_cutoff_endpos, above_cutoff_startpos - np.ndarray pos_array, treat_array, ctrl_array - np.ndarray above_cutoff_index_array - list score_array_s # list to keep different types of scores - list peak_content - int32_t * acs_ptr - int32_t * ace_ptr - int32_t * acia_ptr - float32_t * treat_array_ptr - float32_t * ctrl_array_ptr - - assert len(scoring_function_s) == len(lvl1_cutoff_s), "number of functions and cutoffs should be the same!" - assert len(scoring_function_s) == len(lvl2_cutoff_s), "number of functions and cutoffs should be the same!" - - # first, build pileup, self.chr_pos_treat_ctrl - self.__pileup_treat_ctrl_a_chromosome( chrom ) - [pos_array, treat_array, ctrl_array] = self.chr_pos_treat_ctrl - - # while save_bedGraph is true, invoke __write_bedGraph_for_a_chromosome - if save_bedGraph: - self.__write_bedGraph_for_a_chromosome ( chrom ) - - # keep all types of scores needed - score_array_s = [] - for i in range(len(scoring_function_s)): - s = scoring_function_s[i] - if s == 'p': - score_array_s.append( self.__cal_pscore( treat_array, ctrl_array ) ) - elif s == 'q': - score_array_s.append( self.__cal_qscore( treat_array, ctrl_array ) ) - elif s == 'f': - score_array_s.append( self.__cal_FE( treat_array, ctrl_array ) ) - elif s == 's': - score_array_s.append( self.__cal_subtraction( treat_array, ctrl_array ) ) - - # lvl1 : strong peaks - peak_content = [] # to store points above cutoff - - # get the regions with scores above cutoffs - above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl1_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? - above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices - above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff - above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff - - if above_cutoff.size == 0: - # nothing above cutoff - return - - if above_cutoff[0] == 0: - # first element > cutoff, fix the first point as 0. 
otherwise it would be the last item in data[chrom]['pos'] - above_cutoff_startpos[0] = 0 - - # first bit of region above cutoff - acs_ptr = above_cutoff_startpos.data - ace_ptr = above_cutoff_endpos.data - acia_ptr= above_cutoff_index_array.data - treat_array_ptr = treat_array.data - ctrl_array_ptr = ctrl_array.data - - ts = acs_ptr[ 0 ] - te = ace_ptr[ 0 ] - ti = acia_ptr[ 0 ] - tp = treat_array_ptr[ ti ] - cp = ctrl_array_ptr[ ti ] - - peak_content.append( ( ts, te, tp, cp, ti ) ) - acs_ptr += 1 # move ptr - ace_ptr += 1 - acia_ptr+= 1 - lastp = te - - #peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], treat_array[above_cutoff_index_array[0]], ctrl_array[above_cutoff_index_array[0]], score_array_s, above_cutoff_index_array[0] ) ) - for i in range( 1, above_cutoff_startpos.size ): - ts = acs_ptr[ 0 ] - te = ace_ptr[ 0 ] - ti = acia_ptr[ 0 ] - acs_ptr += 1 - ace_ptr += 1 - acia_ptr+= 1 - tp = treat_array_ptr[ ti ] - cp = ctrl_array_ptr[ ti ] - tl = ts - lastp - if tl <= lvl1_max_gap: - # append - #peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], score_array_s, above_cutoff_index_array[i] ) ) - peak_content.append( ( ts, te, tp, cp, ti ) ) - lastp = te - else: - # close - self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s ) - #peak_content = [ (above_cutoff_startpos[i], above_cutoff_endpos[i], treat_array[above_cutoff_index_array[i]], ctrl_array[above_cutoff_index_array[i]], score_array_s, above_cutoff_index_array[i]) , ] - peak_content = [ ( ts, te, tp, cp, ti ), ] - lastp = te #above_cutoff_endpos[i] - - # save the last peak - if peak_content: - self.__close_peak_for_broad_region (peak_content, lvl1peaks, min_length, chrom, lvl1_max_gap//2, score_array_s ) - - # lvl2 : weak peaks - peak_content = [] # to store points above cutoff - - # get the regions with scores above cutoffs - above_cutoff = np.nonzero( apply_multiple_cutoffs(score_array_s,lvl2_cutoff_s) )[0] # this is not an optimized method. It would be better to store score array in a 2-D ndarray? - above_cutoff_index_array = np.arange(pos_array.shape[0],dtype="int32")[above_cutoff] # indices - above_cutoff_endpos = pos_array[above_cutoff] # end positions of regions where score is above cutoff - above_cutoff_startpos = pos_array[above_cutoff-1] # start positions of regions where score is above cutoff - - if above_cutoff.size == 0: - # nothing above cutoff - return - - if above_cutoff[0] == 0: - # first element > cutoff, fix the first point as 0. 
otherwise it would be the last item in data[chrom]['pos'] - above_cutoff_startpos[0] = 0 - - # first bit of region above cutoff - acs_ptr = above_cutoff_startpos.data - ace_ptr = above_cutoff_endpos.data - acia_ptr= above_cutoff_index_array.data - treat_array_ptr = treat_array.data - ctrl_array_ptr = ctrl_array.data - - ts = acs_ptr[ 0 ] - te = ace_ptr[ 0 ] - ti = acia_ptr[ 0 ] - tp = treat_array_ptr[ ti ] - cp = ctrl_array_ptr[ ti ] - peak_content.append( ( ts, te, tp, cp, ti ) ) - acs_ptr += 1 # move ptr - ace_ptr += 1 - acia_ptr+= 1 - - lastp = te - for i in range( 1, above_cutoff_startpos.size ): - # for everything above cutoff - ts = acs_ptr[ 0 ] # get the start - te = ace_ptr[ 0 ] # get the end - ti = acia_ptr[ 0 ]# get the index - - acs_ptr += 1 # move ptr - ace_ptr += 1 - acia_ptr+= 1 - tp = treat_array_ptr[ ti ] # get the treatment pileup - cp = ctrl_array_ptr[ ti ] # get the control pileup - tl = ts - lastp # get the distance from the current point to last position of existing peak_content - - if tl <= lvl2_max_gap: - # append - peak_content.append( ( ts, te, tp, cp, ti ) ) - lastp = te - else: - # close - self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s ) - - peak_content = [ ( ts, te, tp, cp, ti ), ] - lastp = te - - # save the last peak - if peak_content: - self.__close_peak_for_broad_region (peak_content, lvl2peaks, min_length, chrom, lvl2_max_gap//2, score_array_s ) - - return - - cdef bool __close_peak_for_broad_region (self, list peak_content, peaks, int32_t min_length, - bytes chrom, int32_t smoothlen, list score_array_s, list score_cutoff_s=[]): - """Close the broad peak region, output peak boundaries, peak summit - and scores, then add the peak to peakIO object. - - peak_content contains [start, end, treat_p, ctrl_p, list_scores] - - peaks: a BroadPeakIO object - - """ - cdef: - int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex - float64_t treat_v, ctrl_v, tsummitvalue, ttreat_p, tctrl_p, tscore, summit_treat, summit_ctrl, summit_p_score, summit_q_score - list tlist_pileup, tlist_control, tlist_length - int32_t tlist_scores_p - np.ndarray tarray_pileup, tarray_control, tarray_pscore, tarray_qscore, tarray_fc - - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - tlist_pileup = [] - tlist_control= [] - tlist_length = [] - for i in range(len(peak_content)): # each position in broad peak - (tstart, tend, ttreat_p, tctrl_p, tlist_scores_p) = peak_content[i] - tlist_pileup.append( ttreat_p ) - tlist_control.append( tctrl_p ) - tlist_length.append( tend - tstart ) - - tarray_pileup = np.array( tlist_pileup, dtype="float32") - tarray_control = np.array( tlist_control, dtype="float32") - tarray_pscore = self.__cal_pscore( tarray_pileup, tarray_control ) - tarray_qscore = self.__cal_qscore( tarray_pileup, tarray_control ) - tarray_fc = self.__cal_FE ( tarray_pileup, tarray_control ) - - peaks.add( chrom, # chromosome - peak_content[0][0], # start - peak_content[-1][1], # end - summit = 0, - peak_score = mean_from_value_length( tarray_qscore, tlist_length ), - pileup = mean_from_value_length( tarray_pileup, tlist_length ), - pscore = mean_from_value_length( tarray_pscore, tlist_length ), - fold_change = mean_from_value_length( tarray_fc, tlist_length ), - qscore = mean_from_value_length( tarray_qscore, tlist_length ), - ) - #if chrom == "chr1" and peak_content[0][0] == 237643 and peak_content[-1][1] == 237935: - # 
print tarray_qscore, tlist_length - # start a new peak - return True - - cdef __add_broadpeak (self, bpeaks, bytes chrom, object lvl2peak, list lvl1peakset): - """Internal function to create broad peak. - - *Note* lvl1peakset/strong_regions might be empty - """ - - cdef: - int32_t blockNum, start, end - bytes blockSizes, blockStarts, thickStart, thickEnd, - - start = lvl2peak["start"] - end = lvl2peak["end"] - - if not lvl1peakset: - # will complement by adding 1bps start and end to this region - # may change in the future if gappedPeak format was improved. - bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end), - blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) - return bpeaks - - thickStart = b"%d" % (lvl1peakset[0]["start"]) - thickEnd = b"%d" % (lvl1peakset[-1]["end"]) - blockNum = len(lvl1peakset) - blockSizes = b",".join([b"%d" % y for y in [x["length"] for x in lvl1peakset]]) - blockStarts = b",".join([b"%d" % x for x in getitem_then_subtract(lvl1peakset, start)]) - - # add 1bp left and/or right block if necessary - if int(thickStart) != start: - # add 1bp left block - thickStart = b"%d" % start - blockNum += 1 - blockSizes = b"1,"+blockSizes - blockStarts = b"0,"+blockStarts - if int(thickEnd) != end: - # add 1bp right block - thickEnd = b"%d" % end - blockNum += 1 - blockSizes = blockSizes + b",1" - blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) - - bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, - blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) - return bpeaks - - diff --git a/MACS3/Signal/FixWidthTrack.py b/MACS3/Signal/FixWidthTrack.py new file mode 100644 index 00000000..9774c236 --- /dev/null +++ b/MACS3/Signal/FixWidthTrack.py @@ -0,0 +1,699 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-14 14:53:06 Tao Liu> + +"""Module for FWTrack classes. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +import sys +import io + +# ------------------------------------ +# MACS3 modules +# ------------------------------------ + +from MACS3.IO.PeakIO import PeakIO +from MACS3.Signal.Pileup import se_all_in_one_pileup, over_two_pv_array + +# ------------------------------------ +# Other modules +# ------------------------------------ +import cython +import numpy as np +from cython.cimports.cpython import bool +import cython.cimports.numpy as cnp +from cython.cimports.libc.stdint import INT32_MAX as INT_MAX + +# ------------------------------------ +# constants +# ------------------------------------ + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class FWTrack: + """Fixed Width Locations Track class along the whole genome + (commonly with the same annotation type), which are stored in a + dict. + + Locations are stored and organized by sequence names (chr names) in a + dict. 
They can be sorted by calling self.sort() function. + """ + locations: dict + pointer: dict + buf_size: dict + rlengths: dict + is_sorted: bool + is_destroyed: bool + total = cython.declare(cython.ulong, visibility="public") + annotation = cython.declare(str, visibility="public") + buffer_size = cython.declare(cython.long, visibility="public") + length = cython.declare(cython.long, visibility="public") + fw = cython.declare(cython.int, visibility="public") + + def __init__(self, + fw: cython.int = 0, + anno: str = "", + buffer_size: cython.long = 100000): + """fw is the fixed-width for all locations. + + """ + self.fw = fw + self.locations = {} # location pairs: two strands + self.pointer = {} # location pairs + self.buf_size = {} # location pairs + self.is_sorted = False + self.total = 0 # total tags + self.annotation = anno # need to be figured out + # lengths of reference sequences, e.g. each chromosome in a genome + self.rlengths = {} + self.buffer_size = buffer_size + self.length = 0 + self.is_destroyed = False + + @cython.ccall + def destroy(self): + """Destroy this object and release mem. + """ + chrs: set + chromosome: bytes + + chrs = self.get_chr_names() + for chromosome in sorted(chrs): + if chromosome in self.locations: + self.locations[chromosome][0].resize(self.buffer_size, + refcheck=False) + self.locations[chromosome][0].resize(0, + refcheck=False) + self.locations[chromosome][1].resize(self.buffer_size, + refcheck=False) + self.locations[chromosome][1].resize(0, + refcheck=False) + self.locations[chromosome] = [None, None] + self.locations.pop(chromosome) + self.is_destroyed = True + return + + @cython.ccall + def add_loc(self, + chromosome: bytes, + fiveendpos: cython.int, + strand: cython.int): + """Add a location to the list according to the sequence name. + + chromosome -- mostly the chromosome name + fiveendpos -- 5' end pos, left for plus strand, right for minus strand + strand -- 0: plus, 1: minus + """ + i: cython.int + b: cython.int + arr: cnp.ndarray + + if chromosome not in self.locations: + self.buf_size[chromosome] = [self.buffer_size, self.buffer_size] + self.locations[chromosome] = [np.zeros(self.buffer_size, dtype='i4'), + np.zeros(self.buffer_size, dtype='i4')] + self.pointer[chromosome] = [0, 0] + self.locations[chromosome][strand][0] = fiveendpos + self.pointer[chromosome][strand] = 1 + else: + i = self.pointer[chromosome][strand] + b = self.buf_size[chromosome][strand] + arr = self.locations[chromosome][strand] + if b == i: + b += self.buffer_size + arr.resize(b, refcheck=False) + self.buf_size[chromosome][strand] = b + arr[i] = fiveendpos + self.pointer[chromosome][strand] += 1 + return + + @cython.ccall + def finalize(self): + """ Resize np arrays for 5' positions and sort them in place + + Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read! + """ + c: bytes + chrnames: set + + self.total = 0 + + chrnames = self.get_chr_names() + + for c in chrnames: + self.locations[c][0].resize(self.pointer[c][0], refcheck=False) + self.locations[c][0].sort() + self.locations[c][1].resize(self.pointer[c][1], refcheck=False) + self.locations[c][1].sort() + self.total += self.locations[c][0].size + self.locations[c][1].size + + self.is_sorted = True + self.length = self.fw * self.total + return + + @cython.ccall + def set_rlengths(self, rlengths: dict) -> bool: + """Set reference chromosome lengths dictionary. 
+ + Only the chromosome existing in this fwtrack object will be updated. + + If chromosome in this fwtrack is not covered by given + rlengths, and it has no associated length, it will be set as + maximum integer. + + """ + valid_chroms: set + missed_chroms: set + chrom: bytes + + valid_chroms = set(self.locations.keys()).intersection(rlengths.keys()) + for chrom in sorted(valid_chroms): + self.rlengths[chrom] = rlengths[chrom] + missed_chroms = set(self.locations.keys()).difference(rlengths.keys()) + for chrom in sorted(missed_chroms): + self.rlengths[chrom] = INT_MAX + return True + + @cython.ccall + def get_rlengths(self) -> dict: + """Get reference chromosome lengths dictionary. + + If self.rlength is empty, create a new dict where the length of + chromosome will be set as the maximum integer. + """ + if not self.rlengths: + self.rlengths = dict([(k, INT_MAX) for k in self.locations.keys()]) + return self.rlengths + + @cython.ccall + def get_locations_by_chr(self, chromosome: bytes): + """Return a tuple of two lists of locations for certain chromosome. + + """ + if chromosome in self.locations: + return self.locations[chromosome] + else: + raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) + + @cython.ccall + def get_chr_names(self) -> set: + """Return all the chromosome names stored in this track object. + """ + return set(sorted(self.locations.keys())) + + @cython.ccall + def sort(self): + """Naive sorting for locations. + + """ + c: bytes + chrnames: set + + chrnames = self.get_chr_names() + + for c in chrnames: + self.locations[c][0].sort() + self.locations[c][1].sort() + + self.is_sorted = True + return + + @cython.boundscheck(False) # do not check that np indices are valid + @cython.ccall + def filter_dup(self, maxnum: cython.int = -1) -> cython.ulong: + """Filter the duplicated reads. + + Run it right after you add all data into this object. + + Note, this function will *throw out* duplicates + permenantly. If you want to keep them, use separate_dups + instead. + """ + p: cython.int + n: cython.int + current_loc: cython.int + # index for old array, and index for new one + i_old: cython.ulong + i_new: cython.ulong + size: cython.ulong + k: bytes + plus: cnp.ndarray(cython.int, ndim=1) + new_plus: cnp.ndarray(cython.int, ndim=1) + minus: cnp.ndarray(cython.int, ndim=1) + new_minus: cnp.ndarray(cython.int, ndim=1) + chrnames: set + + if maxnum < 0: + return self.total # do nothing + + if not self.is_sorted: + self.sort() + + self.total = 0 + self.length = 0 + + chrnames = self.get_chr_names() + + for k in chrnames: + # for each chromosome. + # This loop body is too big, I may need to split code later... + + # + strand + i_new = 0 + plus = self.locations[k][0] + size = plus.shape[0] + if len(plus) <= 1: + new_plus = plus # do nothing + else: + new_plus = np.zeros(self.pointer[k][0] + 1, dtype='i4') + new_plus[i_new] = plus[i_new] # first item + i_new += 1 + # the number of tags in the current location + n = 1 + current_loc = plus[0] + for i_old in range(1, size): + p = plus[i_old] + if p == current_loc: + n += 1 + else: + current_loc = p + n = 1 + if n <= maxnum: + new_plus[i_new] = p + i_new += 1 + new_plus.resize(i_new, refcheck=False) + self.total += i_new + self.pointer[k][0] = i_new + # free memory? + # I know I should shrink it to 0 size directly, + # however, on Mac OSX, it seems directly assigning 0 + # doesn't do a thing. + plus.resize(self.buffer_size, refcheck=False) + plus.resize(0, refcheck=False) + # hope there would be no mem leak... 
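The per-position duplicate cap applied above for the plus strand (and mirrored for the minus strand below) boils down to one pass over a sorted array; a pure-Python sketch, with cap_duplicates as an illustrative name:

import numpy as np

def cap_duplicates(sorted_pos, maxnum):
    # keep at most `maxnum` tags per exact position;
    # sorted_pos must already be sorted ascending
    kept = []
    n = 0
    current = None
    for p in sorted_pos:
        if p == current:
            n += 1
        else:
            current, n = p, 1
        if n <= maxnum:
            kept.append(p)
    return np.array(kept, dtype="i4")

print(cap_duplicates([10, 10, 10, 55, 80, 80], maxnum=2))
# [10 10 55 80 80]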
+ + # - strand + i_new = 0 + minus = self.locations[k][1] + size = minus.shape[0] + if len(minus) <= 1: + new_minus = minus # do nothing + else: + new_minus = np.zeros(self.pointer[k][1] + 1, + dtype='i4') + new_minus[i_new] = minus[i_new] # first item + i_new += 1 + # the number of tags in the current location + n = 1 + current_loc = minus[0] + for i_old in range(1, size): + p = minus[i_old] + if p == current_loc: + n += 1 + else: + current_loc = p + n = 1 + if n <= maxnum: + new_minus[i_new] = p + i_new += 1 + new_minus.resize(i_new, refcheck=False) + self.total += i_new + self.pointer[k][1] = i_new + # free memory ? + # I know I should shrink it to 0 size directly, + # however, on Mac OSX, it seems directly assigning 0 + # doesn't do a thing. + minus.resize(self.buffer_size, refcheck=False) + minus.resize(0, refcheck=False) + # hope there would be no mem leak... + + self.locations[k] = [new_plus, new_minus] + + self.length = self.fw * self.total + return self.total + + @cython.ccall + def sample_percent(self, percent: cython.float, seed: cython.int = -1): + """Sample the tags for a given percentage. + + Warning: the current object is changed! + """ + num: cython.int # num: number of reads allowed on a certain chromosome + k: bytes + chrnames: set + + self.total = 0 + self.length = 0 + + chrnames = self.get_chr_names() + + if seed >= 0: + np.random.seed(seed) + + for k in chrnames: + # for each chromosome. + # This loop body is too big, I may need to split code later... + + num = cython.cast(cython.int, + round(self.locations[k][0].shape[0] * percent, 5)) + np.random.shuffle(self.locations[k][0]) + self.locations[k][0].resize(num, refcheck=False) + self.locations[k][0].sort() + self.pointer[k][0] = self.locations[k][0].shape[0] + + num = cython.cast(cython.int, + round(self.locations[k][1].shape[0] * percent, 5)) + np.random.shuffle(self.locations[k][1]) + self.locations[k][1].resize(num, refcheck=False) + self.locations[k][1].sort() + self.pointer[k][1] = self.locations[k][1].shape[0] + + self.total += self.pointer[k][0] + self.pointer[k][1] + + self.length = self.fw * self.total + return + + @cython.ccall + def sample_num(self, samplesize: cython.ulong, seed: cython.int = -1): + """Sample the tags for a given percentage. + + Warning: the current object is changed! + """ + percent: cython.float + + percent = cython.cast(cython.float, samplesize) / self.total + self.sample_percent(percent, seed) + return + + @cython.ccall + def print_to_bed(self, fhd=None): + """Output FWTrack to BED format files. If fhd is given, + write to a file, otherwise, output to standard output. + + """ + i: cython.int + p: cython.int + k: bytes + chrnames: set + + if not fhd: + fhd = sys.stdout + assert isinstance(fhd, io.IOBase) + assert self.fw > 0, "FWTrack object .fw should be set larger than 0!" + + chrnames = self.get_chr_names() + + for k in chrnames: + # for each chromosome. + # This loop body is too big, I may need to split code later... 
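The loop below emits 6-column BED records, one per tag; as a toy illustration (the chromosome name and fw value are arbitrary here):

# a plus-strand tag at p covers [p, p+fw); a minus-strand tag at p
# covers [p-fw, p)
fw, p = 146, 10_000
print("chr1\t%d\t%d\t.\t.\t+" % (p, p + fw))
print("chr1\t%d\t%d\t.\t.\t-" % (p - fw, p))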
+
+            plus = self.locations[k][0]
+
+            for i in range(plus.shape[0]):
+                p = plus[i]
+                fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),
+                                                      p,
+                                                      p + self.fw,
+                                                      "+"))
+
+            minus = self.locations[k][1]
+
+            for i in range(minus.shape[0]):
+                p = minus[i]
+                fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),
+                                                      p-self.fw,
+                                                      p,
+                                                      "-"))
+        return
+
+    @cython.ccall
+    def extract_region_tags(self, chromosome: bytes,
+                            startpos: cython.int, endpos: cython.int) -> tuple:
+        i: cython.int
+        pos: cython.int
+        rt_plus: np.ndarray(cython.int, ndim=1)
+        rt_minus: np.ndarray(cython.int, ndim=1)
+        temp: list
+        chrnames: set
+
+        if not self.is_sorted:
+            self.sort()
+
+        chrnames = self.get_chr_names()
+        assert chromosome in chrnames, "chromosome %s can't be found in the FWTrack object." % chromosome
+
+        (plus, minus) = self.locations[chromosome]
+
+        temp = []
+        for i in range(plus.shape[0]):
+            pos = plus[i]
+            if pos < startpos:
+                continue
+            elif pos > endpos:
+                break
+            else:
+                temp.append(pos)
+        rt_plus = np.array(temp)
+
+        temp = []
+        for i in range(minus.shape[0]):
+            pos = minus[i]
+            if pos < startpos:
+                continue
+            elif pos > endpos:
+                break
+            else:
+                temp.append(pos)
+        rt_minus = np.array(temp)
+        return (rt_plus, rt_minus)
+
+    @cython.ccall
+    def compute_region_tags_from_peaks(self, peaks: PeakIO,
+                                       func,
+                                       window_size: cython.int = 100,
+                                       cutoff: cython.float = 5.0) -> list:
+        """Extract tags in peaks, then apply func on the extracted tags.
+
+        peaks: predefined regions from which to extract raw tags, in PeakIO type; check cPeakIO.pyx.
+
+        func: a function to compute *something* from tags found in a predefined region
+
+        window_size: this will be passed to func.
+
+        cutoff: this will be passed to func.
+
+        func needs a fixed number of parameters, so it's not flexible. Here is an example:
+
+        wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name, window_size, cutoff)
+
+        """
+        m: cython.int
+        i: cython.int
+        j: cython.int
+        pos: cython.int
+        startpos: cython.int
+        endpos: cython.int
+
+        plus: cnp.ndarray(cython.int, ndim=1)
+        minus: cnp.ndarray(cython.int, ndim=1)
+        rt_plus: cnp.ndarray(cython.int, ndim=1)
+        rt_minus: cnp.ndarray(cython.int, ndim=1)
+
+        chrom: bytes
+        name: bytes
+
+        temp: list
+        retval: list
+        pchrnames: set
+        chrnames: set
+
+        pchrnames = peaks.get_chr_names()
+        retval = []
+
+        # this object should be sorted
+        if not self.is_sorted:
+            self.sort()
+        # PeakIO object should be sorted
+        peaks.sort()
+
+        chrnames = self.get_chr_names()
+
+        for chrom in sorted(pchrnames):
+            assert chrom in chrnames, "chromosome %s can't be found in the FWTrack object." % chrom
+            (plus, minus) = self.locations[chrom]
+            cpeaks = peaks.get_data_from_chrom(chrom)
+            prev_i = 0
+            prev_j = 0
+            for m in range(len(cpeaks)):
+                startpos = cpeaks[m]["start"] - window_size
+                endpos = cpeaks[m]["end"] + window_size
+                name = cpeaks[m]["name"]
+
+                temp = []
+                for i in range(prev_i, plus.shape[0]):
+                    pos = plus[i]
+                    if pos < startpos:
+                        continue
+                    elif pos > endpos:
+                        prev_i = i
+                        break
+                    else:
+                        temp.append(pos)
+                rt_plus = np.array(temp, dtype="i4")
+
+                temp = []
+                for j in range(prev_j, minus.shape[0]):
+                    pos = minus[j]
+                    if pos < startpos:
+                        continue
+                    elif pos > endpos:
+                        prev_j = j
+                        break
+                    else:
+                        temp.append(pos)
+                rt_minus = np.array(temp, dtype="i4")
+
+                retval.append(func(chrom, rt_plus, rt_minus, startpos, endpos,
+                                   name=name,
+                                   window_size=window_size,
+                                   cutoff=cutoff))
+                # rewind window_size
+                for i in range(prev_i, 0, -1):
+                    if plus[prev_i] - plus[i] >= window_size:
+                        break
+                    prev_i = i
+
+                for j in range(prev_j, 0, -1):
+                    if minus[prev_j] - minus[j] >= window_size:
+                        break
+                    prev_j = j
+            # end of a loop
+
+        return retval
+
+    @cython.ccall
+    def pileup_a_chromosome(self, chrom: bytes, ds: list,
+                            scale_factor_s: list,
+                            baseline_value: cython.float = 0.0,
+                            directional: bool = True,
+                            end_shift: cython.int = 0) -> list:
+        """pileup a certain chromosome, return [p,v] (end position and
+        value) list.
+
+        ds : tag will be extended to this value to 3' direction,
+             unless directional is False. Can contain multiple
+             extension values. The final pileup will be the maximum.
+
+        scale_factor_s : linearly scale the pileup value applied to
+                         each d in ds. The list should have the same
+                         length as ds.
+
+        baseline_value : a value to be filled for missing values, and
+                         will be the minimum pileup.
+
+        directional : if False, the strand or direction of tag will be
+                      ignored, so that extension will be both sides
+                      with d/2.
+
+        end_shift : move cutting ends towards 5->3 direction if value
+                    is positive, or towards 3->5 direction if
+                    negative. Default is 0 -- no shift at all.
+
+        p and v are numpy.ndarray objects.
+        """
+        d: cython.long
+        five_shift: cython.long
+        # adjustment to 5' end and 3' end positions to make a fragment
+        three_shift: cython.long
+        rlength: cython.long
+        chrlengths: dict
+        five_shift_s: list = []
+        three_shift_s: list = []
+        tmp_pileup: list
+        prev_pileup: list
+
+        chrlengths = self.get_rlengths()
+        rlength = chrlengths[chrom]
+        assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!"
+
+        # adjust extension length according to the 'directional' and
+        # 'end_shift' settings.
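The shift arithmetic in the loop that follows can be checked in isolation. fragment_shifts is a hypothetical helper reproducing the same formulas; the coverage convention in its comment is inferred from how se_all_in_one_pileup is called here:

def fragment_shifts(d, end_shift, directional):
    # returns (five_shift, three_shift); a plus-strand cut site at p
    # is taken to cover [p - five_shift, p + three_shift), mirrored
    # for the minus strand
    if directional:
        return (-end_shift, end_shift + d)
    return (d // 2 - end_shift, end_shift + d - d // 2)

assert fragment_shifts(200, 0, True) == (0, 200)    # extend 3' only
assert fragment_shifts(200, 0, False) == (100, 100) # extend both sides
assert fragment_shifts(201, 0, False) == (100, 101) # odd d splits unevenly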
+ for d in ds: + if directional: + # only extend to 3' side + five_shift_s.append(- end_shift) + three_shift_s.append(end_shift + d) + else: + # both sides + five_shift_s.append(d//2 - end_shift) + three_shift_s.append(end_shift + d - d//2) + + prev_pileup = None + + for i in range(len(ds)): + five_shift = five_shift_s[i] + three_shift = three_shift_s[i] + scale_factor = scale_factor_s[i] + tmp_pileup = se_all_in_one_pileup(self.locations[chrom][0], + self.locations[chrom][1], + five_shift, + three_shift, + rlength, + scale_factor, + baseline_value) + + if prev_pileup: + prev_pileup = over_two_pv_array(prev_pileup, + tmp_pileup, + func="max") + else: + prev_pileup = tmp_pileup + + return prev_pileup + + +@cython.inline +@cython.cfunc +def left_sum(data, + pos: cython.int, + width: cython.int) -> cython.int: + return sum([data[x] for x in data if x <= pos and x >= pos - width]) + + +@cython.inline +@cython.cfunc +def right_sum(data, + pos: cython.int, + width: cython.int) -> cython.int: + return sum([data[x] for x in data if x >= pos and x <= pos + width]) + + +@cython.inline +@cython.cfunc +def left_forward(data, + pos: cython.int, + window_size: cython.int) -> cython.int: + return data.get(pos, 0) - data.get(pos-window_size, 0) + + +@cython.inline +@cython.cfunc +def right_forward(data, + pos: cython.int, + window_size: cython.int) -> cython.int: + return data.get(pos + window_size, 0) - data.get(pos, 0) diff --git a/MACS3/Signal/FixWidthTrack.pyx b/MACS3/Signal/FixWidthTrack.pyx deleted file mode 100644 index 077d6324..00000000 --- a/MACS3/Signal/FixWidthTrack.pyx +++ /dev/null @@ -1,608 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2022-09-15 17:17:37 Tao Liu> - -"""Module for FWTrack classes. - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -import sys -import io -from copy import copy -from collections import Counter - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ - -from MACS3.Utilities.Constants import * -from MACS3.Signal.SignalProcessing import * -from MACS3.IO.PeakIO import PeakIO -from MACS3.Signal.Pileup import se_all_in_one_pileup, over_two_pv_array - -# ------------------------------------ -# Other modules -# ------------------------------------ -from cpython cimport bool -cimport cython -import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "FixWidthTrack $Revision$" -__author__ = "Tao Liu " -__doc__ = "FWTrack class" - -cdef INT_MAX = (((-1))>>1) - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class FWTrack: - """Fixed Width Locations Track class along the whole genome - (commonly with the same annotation type), which are stored in a - dict. - - Locations are stored and organized by sequence names (chr names) in a - dict. They can be sorted by calling self.sort() function. 
- """ - cdef: - dict __locations - dict __pointer - dict __buf_size - bool __sorted - bool __destroyed - dict rlengths - public int64_t buffer_size - public int64_t total - public object annotation - public object dups - public int32_t fw - public int64_t length - - def __init__ (self, int32_t fw=0, char * anno="", int64_t buffer_size = 100000 ): - """fw is the fixed-width for all locations. - - """ - self.fw = fw - self.__locations = {} # location pairs: two strands - self.__pointer = {} # location pairs - self.__buf_size = {} # location pairs - self.__sorted = False - self.total = 0 # total tags - self.annotation = anno # need to be figured out - self.rlengths = {} # lengths of reference sequences, e.g. each chromosome in a genome - self.buffer_size = buffer_size - self.length = 0 - self.__destroyed = False - - cpdef void destroy ( self ): - """Destroy this object and release mem. - """ - cdef: - set chrs - bytes chromosome - - chrs = self.get_chr_names() - for chromosome in sorted(chrs): - if chromosome in self.__locations: - self.__locations[chromosome][0].resize( self.buffer_size, refcheck=False ) - self.__locations[chromosome][0].resize( 0, refcheck=False ) - self.__locations[chromosome][1].resize( self.buffer_size, refcheck=False ) - self.__locations[chromosome][1].resize( 0, refcheck=False ) - self.__locations[chromosome] = [None, None] - self.__locations.pop(chromosome) - self.__destroyed = True - return - - cpdef void add_loc ( self, bytes chromosome, int32_t fiveendpos, int32_t strand ): - """Add a location to the list according to the sequence name. - - chromosome -- mostly the chromosome name - fiveendpos -- 5' end pos, left for plus strand, right for minus strand - strand -- 0: plus, 1: minus - """ - cdef: - int32_t i - int32_t b - np.ndarray arr - - if chromosome not in self.__locations: - self.__buf_size[chromosome] = [ self.buffer_size, self.buffer_size ] - self.__locations[chromosome] = [ np.zeros(self.buffer_size, dtype='int32'), np.zeros(self.buffer_size, dtype='int32') ] # [plus,minus strand] - self.__pointer[chromosome] = [ 0, 0 ] - self.__locations[chromosome][strand][0] = fiveendpos - self.__pointer[chromosome][strand] = 1 - else: - i = self.__pointer[chromosome][strand] - b = self.__buf_size[chromosome][strand] - arr = self.__locations[chromosome][strand] - if b == i: - b += self.buffer_size - arr.resize( b, refcheck = False ) - self.__buf_size[chromosome][strand] = b - arr[i]= fiveendpos - self.__pointer[chromosome][strand] += 1 - return - - cpdef void finalize ( self ): - """ Resize np arrays for 5' positions and sort them in place - - Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read! - """ - - cdef: - int32_t i - bytes c - set chrnames - - self.total = 0 - - chrnames = self.get_chr_names() - - for c in chrnames: - self.__locations[c][0].resize( self.__pointer[c][0], refcheck=False ) - self.__locations[c][0].sort() - self.__locations[c][1].resize( self.__pointer[c][1], refcheck=False ) - self.__locations[c][1].sort() - self.total += self.__locations[c][0].size + self.__locations[c][1].size - - self.__sorted = True - self.length = self.fw * self.total - return - - cpdef bint set_rlengths ( self, dict rlengths ): - """Set reference chromosome lengths dictionary. - - Only the chromosome existing in this fwtrack object will be updated. 
- - If chromosome in this fwtrack is not covered by given - rlengths, and it has no associated length, it will be set as - maximum integer. - - """ - cdef: - set valid_chroms, missed_chroms, extra_chroms - bytes chrom - - valid_chroms = set(self.__locations.keys()).intersection(rlengths.keys()) - for chrom in sorted(valid_chroms): - self.rlengths[chrom] = rlengths[chrom] - missed_chroms = set(self.__locations.keys()).difference(rlengths.keys()) - for chrom in sorted(missed_chroms): - self.rlengths[chrom] = INT_MAX - return True - - cpdef dict get_rlengths ( self ): - """Get reference chromosome lengths dictionary. - - If self.rlength is empty, create a new dict where the length of - chromosome will be set as the maximum integer. - """ - if not self.rlengths: - self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()]) - return self.rlengths - - cpdef get_locations_by_chr ( self, bytes chromosome ): - """Return a tuple of two lists of locations for certain chromosome. - - """ - if chromosome in self.__locations: - return self.__locations[chromosome] - else: - raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) - - cpdef set get_chr_names ( self ): - """Return all the chromosome names stored in this track object. - """ - return set(sorted(self.__locations.keys())) - - cpdef void sort ( self ): - """Naive sorting for locations. - - """ - cdef: - int32_t i - bytes c - set chrnames - - chrnames = self.get_chr_names() - - for c in chrnames: - self.__locations[c][0].sort() - self.__locations[c][1].sort() - - self.__sorted = True - return - - @cython.boundscheck(False) # do not check that np indices are valid - cpdef uint64_t filter_dup ( self, int32_t maxnum = -1): - """Filter the duplicated reads. - - Run it right after you add all data into this object. - - Note, this function will *throw out* duplicates - permenantly. If you want to keep them, use separate_dups - instead. - """ - cdef: - int32_t p, m, n, current_loc - # index for old array, and index for new one - uint64_t i_old, i_new, size, new_size - bytes k - np.ndarray[int32_t, ndim=1] plus, new_plus, minus, new_minus - set chrnames - - if maxnum < 0: return self.total # do nothing - - if not self.__sorted: - self.sort() - - self.total = 0 - self.length = 0 - - chrnames = self.get_chr_names() - - for k in chrnames: - # for each chromosome. - # This loop body is too big, I may need to split code later... - - # + strand - i_new = 0 - plus = self.__locations[k][0] - size = plus.shape[0] - if len(plus) <= 1: - new_plus = plus # do nothing - else: - new_plus = np.zeros( self.__pointer[k][0] + 1,dtype='int32' ) - new_plus[ i_new ] = plus[ i_new ] # first item - i_new += 1 - n = 1 # the number of tags in the current location - current_loc = plus[0] - for i_old in range( 1, size ): - p = plus[ i_old ] - if p == current_loc: - n += 1 - else: - current_loc = p - n = 1 - if n <= maxnum: - new_plus[ i_new ] = p - i_new += 1 - new_plus.resize( i_new, refcheck=False ) - self.total += i_new - self.__pointer[k][0] = i_new - # free memory? - # I know I should shrink it to 0 size directly, - # however, on Mac OSX, it seems directly assigning 0 - # doesn't do a thing. - plus.resize( self.buffer_size, refcheck=False ) - plus.resize( 0, refcheck=False ) - # hope there would be no mem leak... 
- - # - strand - i_new = 0 - minus = self.__locations[k][1] - size = minus.shape[0] - if len(minus) <= 1: - new_minus = minus # do nothing - else: - new_minus = np.zeros( self.__pointer[k][1] + 1,dtype='int32' ) - new_minus[ i_new ] = minus[ i_new ] # first item - i_new += 1 - n = 1 # the number of tags in the current location - current_loc = minus[0] - for i_old in range( 1, size ): - p = minus[ i_old ] - if p == current_loc: - n += 1 - else: - current_loc = p - n = 1 - if n <= maxnum: - new_minus[ i_new ] = p - i_new += 1 - new_minus.resize( i_new, refcheck=False ) - self.total += i_new - self.__pointer[k][1] = i_new - # free memory ? - # I know I should shrink it to 0 size directly, - # however, on Mac OSX, it seems directly assigning 0 - # doesn't do a thing. - minus.resize( self.buffer_size, refcheck=False ) - minus.resize( 0, refcheck=False ) - # hope there would be no mem leak... - - self.__locations[k]=[new_plus,new_minus] - - self.length = self.fw * self.total - return self.total - - cpdef void sample_percent (self, float32_t percent, int32_t seed = -1 ): - """Sample the tags for a given percentage. - - Warning: the current object is changed! - """ - cdef: - int32_t num, i_chrom # num: number of reads allowed on a certain chromosome - bytes k - set chrnames - - self.total = 0 - self.length = 0 - - chrnames = self.get_chr_names() - - if seed >= 0: - np.random.seed(seed) - - for k in chrnames: - # for each chromosome. - # This loop body is too big, I may need to split code later... - - num = round(self.__locations[k][0].shape[0] * percent, 5 ) - np.random.shuffle( self.__locations[k][0] ) - self.__locations[k][0].resize( num, refcheck=False ) - self.__locations[k][0].sort() - self.__pointer[k][0] = self.__locations[k][0].shape[0] - - num = round(self.__locations[k][1].shape[0] * percent, 5 ) - np.random.shuffle( self.__locations[k][1] ) - self.__locations[k][1].resize( num, refcheck=False ) - self.__locations[k][1].sort() - self.__pointer[k][1] = self.__locations[k][1].shape[0] - - self.total += self.__pointer[k][0] + self.__pointer[k][1] - - self.length = self.fw * self.total - return - - cpdef void sample_num (self, uint64_t samplesize, int32_t seed = -1): - """Sample the tags for a given percentage. - - Warning: the current object is changed! - """ - cdef: - float32_t percent - - percent = (samplesize)/self.total - self.sample_percent ( percent, seed ) - return - - cpdef void print_to_bed (self, fhd=None): - """Output FWTrack to BED format files. If fhd is given, - write to a file, otherwise, output to standard output. - - """ - cdef: - int32_t i, i_chrom, p - bytes k - set chrnames - - if not fhd: - fhd = sys.stdout - assert isinstance(fhd,io.IOBase) - assert self.fw > 0, "FWTrack object .fw should be set larger than 0!" - - chrnames = self.get_chr_names() - - for k in chrnames: - # for each chromosome. - # This loop body is too big, I may need to split code later... 
- - plus = self.__locations[k][0] - - for i in range(plus.shape[0]): - p = plus[i] - fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),p,p+self.fw,"+") ) - - minus = self.__locations[k][1] - - for i in range(minus.shape[0]): - p = minus[i] - fhd.write("%s\t%d\t%d\t.\t.\t%s\n" % (k.decode(),p-self.fw,p,"-") ) - return - - cpdef tuple extract_region_tags ( self, bytes chromosome, int32_t startpos, int32_t endpos ): - cdef: - int32_t i, pos - np.ndarray[int32_t, ndim=1] rt_plus, rt_minus - list temp - set chrnames - - if not self.__sorted: self.sort() - - chrnames = self.get_chr_names() - assert chromosome in chrnames, "chromosome %s can't be found in the FWTrack object." % chromosome - - (plus, minus) = self.__locations[chromosome] - - temp = [] - for i in range(plus.shape[0]): - pos = plus[i] - if pos < startpos: - continue - elif pos > endpos: - break - else: - temp.append(pos) - rt_plus = np.array(temp) - - temp = [] - for i in range(minus.shape[0]): - pos = minus[i] - if pos < startpos: - continue - elif pos > endpos: - break - else: - temp.append(pos) - rt_minus = np.array(temp) - return (rt_plus, rt_minus) - - cpdef list compute_region_tags_from_peaks ( self, peaks, func, int32_t window_size = 100, float32_t cutoff = 5 ): - """Extract tags in peak, then apply func on extracted tags. - - peaks: redefined regions to extract raw tags in PeakIO type: check cPeakIO.pyx. - - func: a function to compute *something* from tags found in a predefined region - - window_size: this will be passed to func. - - cutoff: this will be passed to func. - - func needs the fixed number of parameters, so it's not flexible. Here is an example: - - wtd_find_summit(chrom, plus, minus, peak_start, peak_end, name , window_size, cutoff): - - """ - - cdef: - int32_t m, i, j, pre_i, pre_j, pos, startpos, endpos - np.ndarray[int32_t, ndim=1] plus, minus, rt_plus, rt_minus - bytes chrom, name - list temp, retval - set pchrnames, chrnames - - pchrnames = peaks.get_chr_names() - retval = [] - - # this object should be sorted - if not self.__sorted: self.sort() - # PeakIO object should be sorted - peaks.sort() - - chrnames = self.get_chr_names() - - for chrom in sorted(pchrnames): - assert chrom in chrnames, "chromosome %s can't be found in the FWTrack object." 
% chrom - (plus, minus) = self.__locations[chrom] - cpeaks = peaks.get_data_from_chrom(chrom) - prev_i = 0 - prev_j = 0 - for m in range(len(cpeaks)): - startpos = cpeaks[m]["start"] - window_size - endpos = cpeaks[m]["end"] + window_size - name = cpeaks[m]["name"] - - temp = [] - for i in range(prev_i,plus.shape[0]): - pos = plus[i] - if pos < startpos: - continue - elif pos > endpos: - prev_i = i - break - else: - temp.append(pos) - rt_plus = np.array(temp, dtype="int32") - - temp = [] - for j in range(prev_j,minus.shape[0]): - pos = minus[j] - if pos < startpos: - continue - elif pos > endpos: - prev_j = j - break - else: - temp.append(pos) - rt_minus = np.array(temp, dtype="int32") - - retval.append( func(chrom, rt_plus, rt_minus, startpos, endpos, name = name, window_size = window_size, cutoff = cutoff) ) - # rewind window_size - for i in range(prev_i, 0, -1): - if plus[prev_i] - plus[i] >= window_size: - break - prev_i = i - - for j in range(prev_j, 0, -1): - if minus[prev_j] - minus[j] >= window_size: - break - prev_j = j - # end of a loop - - return retval - - cpdef list pileup_a_chromosome ( self, bytes chrom, list ds, list scale_factor_s, float32_t baseline_value = 0.0, bint directional = True, int32_t end_shift = 0 ): - """pileup a certain chromosome, return [p,v] (end position and value) list. - - ds : tag will be extended to this value to 3' direction, - unless directional is False. Can contain multiple extension - values. Final pileup will the maximum. - scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - directional : if False, the strand or direction of tag will be ignored, so that extension will be both sides with d/2. - end_shift : move cutting ends towards 5->3 direction if value is positive, or towards 3->5 direction if negative. Default is 0 -- no shift at all. - - - p and v are numpy.ndarray objects. - """ - cdef: - int64_t d - int64_t five_shift, three_shift # adjustment to 5' end and 3' end positions to make a fragment - dict chrlengths = self.get_rlengths () - int64_t rlength = chrlengths[chrom] - object ends - list five_shift_s = [] - list three_shift_s = [] - list tmp_pileup, prev_pileup - - assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!" - - # adjust extension length according to 'directional' and 'halfextension' setting. 
- for d in ds: - if directional: - # only extend to 3' side - five_shift_s.append( - end_shift ) - three_shift_s.append( end_shift + d) - else: - # both sides - five_shift_s.append( d//2 - end_shift ) - three_shift_s.append( end_shift + d - d//2) - - prev_pileup = None - - for i in range(len(ds)): - five_shift = five_shift_s[i] - three_shift = three_shift_s[i] - scale_factor = scale_factor_s[i] - tmp_pileup = se_all_in_one_pileup ( self.__locations[chrom][0], self.__locations[chrom][1], five_shift, three_shift, rlength, scale_factor, baseline_value ) - - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - - return prev_pileup - -cdef inline int32_t left_sum ( data, int32_t pos, int32_t width ): - """ - """ - return sum([data[x] for x in data if x <= pos and x >= pos - width]) - -cdef inline int32_t right_sum ( data, int32_t pos, int32_t width ): - """ - """ - return sum([data[x] for x in data if x >= pos and x <= pos + width]) - -cdef inline int32_t left_forward ( data, int32_t pos, int32_t window_size ): - return data.get(pos,0) - data.get(pos-window_size, 0) - -cdef inline int32_t right_forward ( data, int32_t pos, int32_t window_size ): - return data.get(pos + window_size, 0) - data.get(pos, 0) - diff --git a/MACS3/Signal/HMMR_Signal_Processing.py b/MACS3/Signal/HMMR_Signal_Processing.py index b6185663..a6c4a8cb 100644 --- a/MACS3/Signal/HMMR_Signal_Processing.py +++ b/MACS3/Signal/HMMR_Signal_Processing.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-04 10:25:29 Tao Liu> +# Time-stamp: <2024-10-14 17:04:27 Tao Liu> """Module description: @@ -137,7 +137,7 @@ def generate_digested_signals(petrack, weight_mapping: list) -> list: certain_signals = ret_digested_signals[i] bdg = bedGraphTrackI() for chrom in sorted(certain_signals.keys()): - bdg.add_chrom_data_hmmr_PV(chrom, certain_signals[chrom]) + bdg.add_chrom_data_PV(chrom, certain_signals[chrom]) ret_bedgraphs.append(bdg) return ret_bedgraphs diff --git a/MACS3/Signal/PairedEndTrack.py b/MACS3/Signal/PairedEndTrack.py new file mode 100644 index 00000000..72c39d91 --- /dev/null +++ b/MACS3/Signal/PairedEndTrack.py @@ -0,0 +1,1016 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-15 15:56:00 Tao Liu> + +"""Module for filter duplicate tags from paired-end data + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). 
+""" + +# ------------------------------------ +# Python modules +# ------------------------------------ +import io +import sys +from array import array as pyarray +from collections import Counter + +# ------------------------------------ +# MACS3 modules +# ------------------------------------ +from MACS3.Signal.Pileup import (quick_pileup, + over_two_pv_array, + se_all_in_one_pileup) +from MACS3.Signal.BedGraph import (bedGraphTrackI, + bedGraphTrackII) +from MACS3.Signal.PileupV2 import (pileup_from_LR_hmmratac, + pileup_from_LRC) +# ------------------------------------ +# Other modules +# ------------------------------------ +import cython +import numpy as np +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool +from cython.cimports.libc.stdint import INT32_MAX as INT_MAX + +from MACS3.Utilities.Logger import logging + +logger = logging.getLogger(__name__) +debug = logger.debug +info = logger.info + +# Let numpy enforce PE-ness using ndarray, gives bonus speedup when sorting +# PE data doesn't have strandedness + + +@cython.cclass +class PETrackI: + """Paired End Locations Track class I along the whole genome + (commonly with the same annotation type), which are stored in a + dict. + + Locations are stored and organized by sequence names (chr names) in a + dict. They can be sorted by calling self.sort() function. + """ + locations = cython.declare(dict, visibility="public") + size = cython.declare(dict, visibility="public") + buf_size = cython.declare(dict, visibility="public") + is_sorted = cython.declare(bool, visibility="public") + total = cython.declare(cython.ulong, visibility="public") + annotation = cython.declare(str, visibility="public") + # rlengths: reference chromosome lengths dictionary + rlengths = cython.declare(dict, visibility="public") + buffer_size = cython.declare(cython.long, visibility="public") + length = cython.declare(cython.long, visibility="public") + average_template_length = cython.declare(cython.float, visibility="public") + is_destroyed: bool + + def __init__(self, anno: str = "", buffer_size: cython.long = 100000): + """fw is the fixed-width for all locations. + + """ + # dictionary with chrname as key, nparray with + # [('l','i4'),('r','i4')] as value + self.locations = {} + # dictionary with chrname as key, size of the above nparray as value + # size is to remember the size of the fragments added to this chromosome + self.size = {} + # dictionary with chrname as key, size of the above nparray as value + self.buf_size = {} + self.is_sorted = False + self.total = 0 # total fragments + self.annotation = anno # need to be figured out + self.rlengths = {} + self.buffer_size = buffer_size + self.length = 0 + self.average_template_length = 0.0 + self.is_destroyed = False + + @cython.ccall + def add_loc(self, chromosome: bytes, + start: cython.int, end: cython.int): + """Add a location to the list according to the sequence name. + + chromosome -- mostly the chromosome name + fiveendpos -- 5' end pos, left for plus strand, right for neg strand + """ + i: cython.int + + if chromosome not in self.locations: + self.buf_size[chromosome] = self.buffer_size + # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment. 
+ self.locations[chromosome] = np.zeros(shape=self.buffer_size, + dtype=[('l', 'i4'), ('r', 'i4')]) + self.locations[chromosome][0] = (start, end) + self.size[chromosome] = 1 + else: + i = self.size[chromosome] + if self.buf_size[chromosome] == i: + self.buf_size[chromosome] += self.buffer_size + self.locations[chromosome].resize((self.buf_size[chromosome]), + refcheck=False) + self.locations[chromosome][i] = (start, end) + self.size[chromosome] = i + 1 + self.length += end - start + return + + @cython.ccall + def destroy(self): + """Destroy this object and release mem. + """ + chrs: set + chromosome: bytes + + chrs = self.get_chr_names() + for chromosome in sorted(chrs): + if chromosome in self.locations: + self.locations[chromosome].resize(self.buffer_size, + refcheck=False) + self.locations[chromosome].resize(0, + refcheck=False) + self.locations[chromosome] = None + self.locations.pop(chromosome) + self.is_destroyed = True + return + + @cython.ccall + def set_rlengths(self, rlengths: dict) -> bool: + """Set reference chromosome lengths dictionary. + + Only the chromosome existing in this petrack object will be updated. + + If a chromosome in this petrack is not covered by given + rlengths, and it has no associated length, it will be set as + maximum integer. + """ + valid_chroms: set + missed_chroms: set + chrom: bytes + + valid_chroms = set(self.locations.keys()).intersection(rlengths.keys()) + for chrom in sorted(valid_chroms): + self.rlengths[chrom] = rlengths[chrom] + missed_chroms = set(self.locations.keys()).difference(rlengths.keys()) + for chrom in sorted(missed_chroms): + self.rlengths[chrom] = INT_MAX + return True + + @cython.ccall + def get_rlengths(self) -> dict: + """Get reference chromosome lengths dictionary. + + If self.rlengths is empty, create a new dict where the length of + chromosome will be set as the maximum integer. + """ + if not self.rlengths: + self.rlengths = dict([(k, INT_MAX) for k in self.locations.keys()]) + return self.rlengths + + @cython.ccall + def finalize(self): + """Resize np arrays for 5' positions and sort them in place + + Note: If this function is called, it's impossible to append + more files to this PETrackI object. So remember to call it + after all the files are read! + + """ + c: bytes + chrnames: set + + self.total = 0 + + chrnames = self.get_chr_names() + + for c in chrnames: + self.locations[c].resize((self.size[c]), refcheck=False) + self.locations[c].sort(order=['l', 'r']) + self.total += self.size[c] + + self.is_sorted = True + self.average_template_length = cython.cast(cython.float, self.length) / self.total + return + + @cython.ccall + def get_locations_by_chr(self, chromosome: bytes): + """Return a tuple of two lists of locations for certain chromosome. + + """ + if chromosome in self.locations: + return self.locations[chromosome] + else: + raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) + + @cython.ccall + def get_chr_names(self) -> set: + """Return all the chromosome names in this track object as a python set. + """ + return set(self.locations.keys()) + + @cython.ccall + def sort(self): + """Naive sorting for locations. + + """ + c: bytes + chrnames: set + + chrnames = self.get_chr_names() + + for c in chrnames: + self.locations[c].sort(order=['l', 'r']) # sort by the leftmost location + self.is_sorted = True + return + + @cython.ccall + def count_fraglengths(self) -> dict: + """Return a dictionary of the counts for sizes/fragment + lengths of each pair. + + This function is for HMMRATAC. 
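A toy version of the fragment-length tally this method performs, assuming the structured storage shown earlier; sizes are r - l per fragment, counted into a histogram dict:

import numpy as np
from collections import Counter

locs = np.array([(100, 250), (300, 450), (500, 700)],
                dtype=[('l', 'i4'), ('r', 'i4')])
counter = Counter()
for s in (locs['r'] - locs['l']):
    counter[int(s)] += 1
print(dict(counter))  # {150: 2, 200: 1}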
+ + """ + sizes: cnp.ndarray(cnp.int32_t, ndim=1) + s: cython.int + locs: cnp.ndarray + chrnames: list + i: cython.int + + counter = Counter() + chrnames = list(self.get_chr_names()) + for i in range(len(chrnames)): + locs = self.locations[chrnames[i]] + sizes = locs['r'] - locs['l'] + for s in sizes: + counter[s] += 1 + return dict(counter) + + @cython.ccall + def fraglengths(self) -> cnp.ndarray: + """Return the sizes/fragment lengths of each pair. + + This function is for HMMRATAC EM training. + """ + sizes: cnp.ndarray(np.int32_t, ndim=1) + locs: cnp.ndarray + chrnames: list + i: cython.int + + chrnames = list(self.get_chr_names()) + locs = self.locations[chrnames[0]] + sizes = locs['r'] - locs['l'] + for i in range(1, len(chrnames)): + locs = self.locations[chrnames[i]] + sizes = np.concatenate((sizes, locs['r'] - locs['l'])) + return sizes + + @cython.boundscheck(False) # do not check that np indices are valid + @cython.ccall + def filter_dup(self, maxnum: cython.int = -1): + """Filter the duplicated reads. + + Run it right after you add all data into this object. + """ + n: cython.int + loc_start: cython.int + loc_end: cython.int + current_loc_start: cython.int + current_loc_end: cython.int + i: cython.ulong + locs_size: cython.ulong + k: bytes + locs: cnp.ndarray + chrnames: set + selected_idx: cnp.ndarray + + if maxnum < 0: + return # condition to return if not filtering + + if not self.is_sorted: + self.sort() + + self.total = 0 + # self.length = 0 + self.average_template_length = 0.0 + + chrnames = self.get_chr_names() + + for k in chrnames: # for each chromosome + locs = self.locations[k] + locs_size = locs.shape[0] + if locs_size == 1: + # do nothing and continue + continue + # discard duplicate reads and make a new locations[k] + # initialize boolean array as all TRUE, or all being kept + selected_idx = np.ones(locs_size, dtype=bool) + # get the first loc + (current_loc_start, current_loc_end) = locs[0] + i = 1 # index of new_locs + n = 1 # the number of tags in the current genomic location + for i in range(1, locs_size): + (loc_start, loc_end) = locs[i] + if loc_start != current_loc_start or loc_end != current_loc_end: + # not the same, update currnet_loc_start/end/l, reset n + current_loc_start = loc_start + current_loc_end = loc_end + n = 1 + continue + else: + # both ends are the same, add 1 to duplicate number n + n += 1 + if n > maxnum: + # change the flag to False + selected_idx[i] = False + # subtract current_loc_l from self.length + self.length -= current_loc_end - current_loc_start + self.locations[k] = locs[selected_idx] + self.size[k] = self.locations[k].shape[0] + self.total += self.size[k] + # free memory? + # I know I should shrink it to 0 size directly, + # however, on Mac OSX, it seems directly assigning 0 + # doesn't do a thing. + selected_idx.resize(self.buffer_size, refcheck=False) + selected_idx.resize(0, refcheck=False) + self.average_template_length = self.length / self.total + return + + @cython.ccall + def sample_percent(self, percent: cython.float, seed: cython.int = -1): + """Sample the tags for a given percentage. + + Warning: the current object is changed! If a new PETrackI is + wanted, use sample_percent_copy instead. 
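A sketch of the reproducible downsampling done below, using the same seeded MT19937 stream as the method; sample_rows is an illustrative name and the toy array is arbitrary:

import numpy as np

def sample_rows(arr, percent, seed):
    # shuffle with a seeded generator, keep the first `percent`
    # fraction, then restore sorted order
    rs = np.random.RandomState(
        np.random.MT19937(np.random.SeedSequence(seed)))
    out = arr.copy()
    rs.shuffle(out)
    num = int(round(out.shape[0] * percent))
    return np.sort(out[:num])

a = np.arange(1000, dtype="i4")
# same seed, same subsample
assert (sample_rows(a, 0.1, 42) == sample_rows(a, 0.1, 42)).all()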
+
+        """
+        # num: number of reads allowed on a certain chromosome
+        num: cython.uint
+        k: bytes
+        chrnames: set
+
+        self.total = 0
+        self.length = 0
+        self.average_template_length = 0.0
+
+        chrnames = self.get_chr_names()
+
+        if seed >= 0:
+            info(f"# A random seed {seed} has been used")
+            rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))
+            rs_shuffle = rs.shuffle
+        else:
+            rs_shuffle = np.random.shuffle
+
+        for k in sorted(chrnames):
+            # for each chromosome.
+            # This loop body is too big, I may need to split code later...
+
+            num = cython.cast(cython.uint,
+                              round(self.locations[k].shape[0] * percent, 5))
+            rs_shuffle(self.locations[k])
+            self.locations[k].resize(num, refcheck=False)
+            self.locations[k].sort(order=['l', 'r'])  # sort by leftmost positions
+            self.size[k] = self.locations[k].shape[0]
+            self.length += (self.locations[k]['r'] - self.locations[k]['l']).sum()
+            self.total += self.size[k]
+        self.average_template_length = cython.cast(cython.float, self.length)/self.total
+        return
+
+    @cython.ccall
+    def sample_percent_copy(self, percent: cython.float, seed: cython.int = -1):
+        """Sample the tags for a given percentage. Return a new
+        PETrackI object.
+
+        """
+        # num: number of reads allowed on a certain chromosome
+        num: cython.uint
+        k: bytes
+        chrnames: set
+        ret_petrackI: PETrackI
+        loc: cnp.ndarray
+
+        ret_petrackI = PETrackI(anno=self.annotation, buffer_size=self.buffer_size)
+        chrnames = self.get_chr_names()
+
+        if seed >= 0:
+            info(f"# A random seed {seed} has been used in the sampling function")
+            rs = np.random.default_rng(seed)
+        else:
+            rs = np.random.default_rng()
+
+        rs_shuffle = rs.shuffle
+
+        # chrnames need to be sorted otherwise we can't assure reproducibility
+        for k in sorted(chrnames):
+            # for each chromosome.
+            # This loop body is too big, I may need to split code later...
+            loc = np.copy(self.locations[k])
+            num = cython.cast(cython.uint, round(loc.shape[0] * percent, 5))
+            rs_shuffle(loc)
+            loc.resize(num, refcheck=False)
+            loc.sort(order=['l', 'r'])  # sort by leftmost positions
+            ret_petrackI.locations[k] = loc
+            ret_petrackI.size[k] = loc.shape[0]
+            ret_petrackI.length += (loc['r'] - loc['l']).sum()
+            ret_petrackI.total += ret_petrackI.size[k]
+        ret_petrackI.average_template_length = cython.cast(cython.float, ret_petrackI.length)/ret_petrackI.total
+        ret_petrackI.set_rlengths(self.get_rlengths())
+        return ret_petrackI
+
+    @cython.ccall
+    def sample_num(self, samplesize: cython.ulong, seed: cython.int = -1):
+        """Sample the tags for a given number.
+
+        Warning: the current object is changed!
+        """
+        percent: cython.float
+
+        percent = cython.cast(cython.float, samplesize)/self.total
+        self.sample_percent(percent, seed)
+        return
+
+    @cython.ccall
+    def sample_num_copy(self, samplesize: cython.ulong, seed: cython.int = -1):
+        """Sample the tags for a given number. Return a new PETrackI
+        object; the current object is not changed.
+        """
+        percent: cython.float
+
+        percent = cython.cast(cython.float, samplesize)/self.total
+        return self.sample_percent_copy(percent, seed)
+
+    @cython.ccall
+    def print_to_bed(self, fhd=None):
+        """Output to BEDPE format files. If fhd is given, write to a
+        file, otherwise, output to standard output.
+
+        """
+        i: cython.int
+        s: cython.int
+        e: cython.int
+        k: bytes
+        chrnames: set
+
+        if not fhd:
+            fhd = sys.stdout
+        assert isinstance(fhd, io.IOBase)
+
+        chrnames = self.get_chr_names()
+
+        for k in chrnames:
+            # for each chromosome.
+            # This loop body is too big, I may need to split code later...
+
+            locs = self.locations[k]
+
+            for i in range(locs.shape[0]):
+                s, e = locs[i]
+                fhd.write("%s\t%d\t%d\n" % (k.decode(), s, e))
+        return
+
+    @cython.ccall
+    def pileup_a_chromosome(self,
+                            chrom: bytes,
+                            scale_factor_s: list,
+                            baseline_value: cython.float = 0.0) -> list:
+        """pileup a certain chromosome, return [p,v] (end position and
+        value) list.
+
+        scale_factor_s : a list of scaling factors to be applied to the
+                         pileup values linearly. The final pileup is
+                         the maximum over all scaling factors.
+
+        baseline_value : a value to be filled for missing values, and
+                         will be the minimum pileup.
+
+        """
+        tmp_pileup: list
+        prev_pileup: list
+        scale_factor: cython.float
+
+        prev_pileup = None
+
+        for i in range(len(scale_factor_s)):
+            scale_factor = scale_factor_s[i]
+
+            # Can't directly pass partial nparray there since that will mess up with pointer calculation.
+            tmp_pileup = quick_pileup(np.sort(self.locations[chrom]['l']),
+                                      np.sort(self.locations[chrom]['r']),
+                                      scale_factor, baseline_value)
+
+            if prev_pileup:
+                prev_pileup = over_two_pv_array(prev_pileup,
+                                                tmp_pileup,
+                                                func="max")
+            else:
+                prev_pileup = tmp_pileup
+
+        return prev_pileup
+
+    @cython.ccall
+    def pileup_a_chromosome_c(self,
+                              chrom: bytes,
+                              ds: list,
+                              scale_factor_s: list,
+                              baseline_value: cython.float = 0.0) -> list:
+        """pileup a certain chromosome, return [p,v] (end position and
+        value) list.
+
+        This function is for control track. Basically, here is a
+        simplified function from FixWidthTrack. We pretend the PE is
+        SE data and left read is on plus strand and right read is on
+        minus strand.
+
+        ds             : tag will be extended to this value to 3'
+                         direction, unless directional is False. Can
+                         contain multiple extension values. Final
+                         pileup will be the maximum.
+        scale_factor_s : linearly scale the pileup value applied to
+                         each d in ds. The list should have the same
+                         length as ds.
+        baseline_value : a value to be filled for missing values, and
+                         will be the minimum pileup.
+        """
+        tmp_pileup: list
+        prev_pileup: list
+        scale_factor: cython.float
+        d: cython.long
+        five_shift: cython.long
+        three_shift: cython.long
+        rlength: cython.long = self.get_rlengths()[chrom]
+
+        if not self.is_sorted:
+            self.sort()
+
+        assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!"
+
+        prev_pileup = None
+
+        for i in range(len(scale_factor_s)):
+            d = ds[i]
+            scale_factor = scale_factor_s[i]
+            five_shift = d//2
+            three_shift = d//2
+
+            tmp_pileup = se_all_in_one_pileup(self.locations[chrom]['l'],
+                                              self.locations[chrom]['r'],
+                                              five_shift,
+                                              three_shift,
+                                              rlength,
+                                              scale_factor,
+                                              baseline_value)
+
+            if prev_pileup:
+                prev_pileup = over_two_pv_array(prev_pileup,
+                                                tmp_pileup,
+                                                func="max")
+            else:
+                prev_pileup = tmp_pileup
+
+        return prev_pileup
+
+    @cython.ccall
+    def pileup_bdg(self,
+                   scale_factor_s: list,
+                   baseline_value: cython.float = 0.0):
+        """pileup all chromosomes, and return a bedGraphTrackI object.
+
+        scale_factor_s : a list of scaling factors to be applied to the
+                         pileup values linearly. The final pileup is
+                         the maximum over all scaling factors.
+
+        baseline_value : a value to be filled for missing values, and
+                         will be the minimum pileup.
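over_two_pv_array (imported from MACS3.Signal.Pileup) merges two such [positions, values] pileups; the following is a simplified pure-Python stand-in for the func="max" case, only to make the [p, v] convention concrete. It assumes positions are the end coordinates of consecutive intervals, as the docstrings above state.

    def max_over_pv(pv1, pv2):
        p1, v1 = pv1
        p2, v2 = pv2
        ret_p, ret_v = [], []
        i = j = 0
        while i < len(p1) and j < len(p2):
            p = min(p1[i], p2[j])          # next breakpoint in either pileup
            ret_p.append(p)
            ret_v.append(max(v1[i], v2[j]))
            if p1[i] == p:
                i += 1
            if p2[j] == p:
                j += 1
        return [ret_p, ret_v]

    # max of (0-50: 1, 50-100: 3) and (0-60: 2, 60-100: 1)
    print(max_over_pv([[50, 100], [1, 3]], [[60, 100], [2, 1]]))
    # [[50, 60, 100], [2, 3, 3]]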
+
+        """
+        tmp_pileup: list
+        prev_pileup: list
+        scale_factor: cython.float
+        chrom: bytes
+        bdg: bedGraphTrackI
+
+        bdg = bedGraphTrackI(baseline_value=baseline_value)
+
+        for chrom in sorted(self.get_chr_names()):
+            prev_pileup = None
+            for i in range(len(scale_factor_s)):
+                scale_factor = scale_factor_s[i]
+
+                # Can't directly pass partial nparray there since that
+                # will mess up with pointer calculation.
+                tmp_pileup = quick_pileup(np.sort(self.locations[chrom]['l']),
+                                          np.sort(self.locations[chrom]['r']),
+                                          scale_factor,
+                                          baseline_value)
+
+                if prev_pileup:
+                    prev_pileup = over_two_pv_array(prev_pileup,
+                                                    tmp_pileup,
+                                                    func="max")
+                else:
+                    prev_pileup = tmp_pileup
+            # save to bedGraph
+            bdg.add_chrom_data(chrom,
+                               pyarray('i', prev_pileup[0]),
+                               pyarray('f', prev_pileup[1]))
+        return bdg
+
+    @cython.ccall
+    def pileup_bdg_hmmr(self,
+                        mapping: list,
+                        baseline_value: cython.float = 0.0) -> list:
+        """pileup all chromosomes, and return a list of four dicts --
+        one each for the short, mono-, di-, and tri-nucleosomal
+        signals -- mapping each chromosome to a p-v ndarray.
+
+        This is specifically designed for hmmratac
+        HMM_SignalProcessing.py. Not a general function.
+
+        The idea is that for each fragment length, we generate four
+        bdg using four weights from four distributions. Then we add
+        all sets of four bdgs together.
+
+        Way to generate 'mapping', based on HMMR EM means and stddevs:
+        fl_dict = petrack.count_fraglengths()
+        fl_list = list(fl_dict.keys())
+        fl_list.sort()
+        weight_mapping = generate_weight_mapping(fl_list, em_means, em_stddevs)
+
+        """
+        ret_pileup: list
+        chroms: set
+        chrom: bytes
+        i: cython.int
+
+        ret_pileup = []
+        for i in range(len(mapping)):
+            ret_pileup.append({})
+        chroms = self.get_chr_names()
+        for i in range(len(mapping)):
+            for chrom in sorted(chroms):
+                ret_pileup[i][chrom] = pileup_from_LR_hmmratac(self.locations[chrom], mapping[i])
+        return ret_pileup
+
+
+@cython.cclass
+class PETrackII:
+    """Paired-end track class for fragment files from single-cell
+    ATAC-seq experiments. We will store data of start, end, barcode,
+    and count from the fragment files.
+
+    * I choose not to inherit PETrackI because there would be a lot of
+      differences.
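Before the PETrackII internals below, it may help to see the input this class models. A fragment file line is BED-like with two extra columns for barcode and count; the actual reading is done by the fragment parser elsewhere in this changeset, but conceptually each line supplies the five arguments of add_loc. The coordinates here are illustrative only.

    # a typical single-cell ATAC fragment file line (tab-separated:
    # chrom, start, end, barcode, count)
    line = b"chr1\t10500\t10690\tATCTGCTAGTCTACAT\t2"

    chrom, start, end, barcode, count = line.split(b"\t")
    # these become the five arguments of PETrackII.add_loc below
    print(chrom, int(start), int(end), barcode, int(count))
    # b'chr1' 10500 10690 b'ATCTGCTAGTCTACAT' 2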
+
+    """
+    locations = cython.declare(dict, visibility="public")
+    # A dict storing the barcode of each fragment. Each barcode is
+    # first converted into an integer, and the mapping is remembered
+    # in barcode_dict (key: bytes, value: 4-byte integer).
+    barcodes = cython.declare(dict, visibility="public")
+    barcode_dict = cython.declare(dict, visibility="public")
+    # the next integer to assign when a new barcode is seen
+    barcode_last_n: cython.int
+
+    size = cython.declare(dict, visibility="public")
+    buf_size = cython.declare(dict, visibility="public")
+    is_sorted = cython.declare(bool, visibility="public")
+    total = cython.declare(cython.ulong, visibility="public")
+    annotation = cython.declare(str, visibility="public")
+    # rlengths: reference chromosome lengths dictionary
+    rlengths = cython.declare(dict, visibility="public")
+    buffer_size = cython.declare(cython.long, visibility="public")
+    length = cython.declare(cython.long, visibility="public")
+    average_template_length = cython.declare(cython.float, visibility="public")
+    is_destroyed: bool
+
+    def __init__(self, anno: str = "", buffer_size: cython.long = 100000):
+        # dictionary with chrname as key, nparray with
+        # [('l','i4'),('r','i4'),('c','u1')] as value
+        self.locations = {}
+        # dictionary with chrname as key, number of fragments stored
+        # for that chromosome as value
+        self.size = {}
+        # dictionary with chrname as key, allocated capacity of the
+        # above nparray as value
+        self.buf_size = {}
+        self.is_sorted = False
+        self.total = 0          # total fragments
+        self.annotation = anno  # need to be figured out
+        self.rlengths = {}
+        self.buffer_size = buffer_size
+        self.length = 0
+        self.average_template_length = 0.0
+        self.is_destroyed = False
+
+        self.barcodes = {}
+        self.barcode_dict = {}
+        self.barcode_last_n = 0
+
+    @cython.ccall
+    def add_loc(self,
+                chromosome: bytes,
+                start: cython.int,
+                end: cython.int,
+                barcode: bytes,
+                count: cython.uchar):
+        """Add a location to the list according to the sequence name.
+
+        chromosome: mostly the chromosome name
+        start: left position of the fragment
+        end: right position of the fragment
+        barcode: the barcode of the fragment
+        count: the count of the fragment
+        """
+        i: cython.int
+        # bn: the integer in barcode_dict for this barcode
+        bn: cython.int
+
+        if barcode not in self.barcode_dict:
+            self.barcode_dict[barcode] = self.barcode_last_n
+            self.barcode_last_n += 1
+        bn = self.barcode_dict[barcode]
+
+        if chromosome not in self.locations:
+            self.buf_size[chromosome] = self.buffer_size
+            # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment.
+            # ['c'] is the count number of this fragment
+            self.locations[chromosome] = np.zeros(shape=self.buffer_size,
+                                                  dtype=[('l', 'i4'), ('r', 'i4'), ('c', 'u1')])
+            self.barcodes[chromosome] = np.zeros(shape=self.buffer_size,
+                                                 dtype='i4')
+            self.locations[chromosome][0] = (start, end, count)
+            self.barcodes[chromosome][0] = bn
+            self.size[chromosome] = 1
+        else:
+            i = self.size[chromosome]
+            if self.buf_size[chromosome] == i:
+                self.buf_size[chromosome] += self.buffer_size
+                self.locations[chromosome].resize((self.buf_size[chromosome]),
+                                                  refcheck=False)
+                # the barcode array must grow in step with the
+                # locations array, otherwise the assignment below
+                # would run out of bounds
+                self.barcodes[chromosome].resize((self.buf_size[chromosome]),
+                                                 refcheck=False)
+            self.locations[chromosome][i] = (start, end, count)
+            self.barcodes[chromosome][i] = bn
+            self.size[chromosome] = i + 1
+        self.length += (end - start) * count
+        return
+
+    @cython.ccall
+    def destroy(self):
+        """Destroy this object and release mem.
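The barcode_dict bookkeeping in add_loc is a plain string-interning scheme; a minimal stand-alone sketch (intern is a hypothetical helper name, not a method of the class):

    def intern(barcode, table):
        # assign the next free integer to a barcode we haven't seen
        if barcode not in table:
            table[barcode] = len(table)
        return table[barcode]

    table = {}
    print(intern(b"ATCTGCTAGTCTACAT", table))   # 0
    print(intern(b"ATTCTCGATGCAGTCA", table))   # 1
    print(intern(b"ATCTGCTAGTCTACAT", table))   # 0 -- same barcode, same id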
+        """
+        chrs: set
+        chromosome: bytes
+
+        chrs = self.get_chr_names()
+        for chromosome in sorted(chrs):
+            if chromosome in self.locations:
+                self.locations[chromosome].resize(self.buffer_size,
+                                                  refcheck=False)
+                self.locations[chromosome].resize(0,
+                                                  refcheck=False)
+                self.locations[chromosome] = None
+                self.locations.pop(chromosome)
+                self.barcodes[chromosome].resize(self.buffer_size,
+                                                 refcheck=False)
+                self.barcodes[chromosome].resize(0,
+                                                 refcheck=False)
+                self.barcodes[chromosome] = None
+                self.barcodes.pop(chromosome)
+        self.barcode_dict = {}
+        self.is_destroyed = True
+        return
+
+    @cython.ccall
+    def set_rlengths(self, rlengths: dict) -> bool:
+        """Set reference chromosome lengths dictionary.
+
+        Only the chromosomes existing in this petrack object will be
+        updated.
+
+        If a chromosome in this petrack is not covered by the given
+        rlengths, its length will be set to the maximum integer.
+        """
+        valid_chroms: set
+        missed_chroms: set
+        chrom: bytes
+
+        valid_chroms = set(self.locations.keys()).intersection(rlengths.keys())
+        for chrom in sorted(valid_chroms):
+            self.rlengths[chrom] = rlengths[chrom]
+        missed_chroms = set(self.locations.keys()).difference(rlengths.keys())
+        for chrom in sorted(missed_chroms):
+            self.rlengths[chrom] = INT_MAX
+        return True
+
+    @cython.ccall
+    def get_rlengths(self) -> dict:
+        """Get reference chromosome lengths dictionary.
+
+        If self.rlengths is empty, create a new dict where the length
+        of each chromosome is set to the maximum integer.
+        """
+        if not self.rlengths:
+            self.rlengths = dict([(k, INT_MAX) for k in self.locations.keys()])
+        return self.rlengths
+
+    @cython.ccall
+    def finalize(self):
+        """Resize the numpy arrays of locations and barcodes, and sort
+        them in place by (l, r), keeping both arrays aligned.
+
+        Note: If this function is called, it's impossible to append
+        more files to this PETrackII object. So remember to call it
+        after all the files are read!
+
+        """
+        c: bytes
+        chrnames: set
+        indices: cnp.ndarray
+
+        self.total = 0
+
+        chrnames = self.get_chr_names()
+
+        for c in chrnames:
+            self.locations[c].resize((self.size[c]), refcheck=False)
+            indices = np.argsort(self.locations[c], order=['l', 'r'])
+            self.locations[c] = self.locations[c][indices]
+            self.barcodes[c] = self.barcodes[c][indices]
+            self.total += np.sum(self.locations[c]['c'])
+
+        self.is_sorted = True
+        self.average_template_length = cython.cast(cython.float,
+                                                   self.length) / self.total
+        return
+
+    @cython.ccall
+    def get_locations_by_chr(self, chromosome: bytes):
+        """Return a np array of left/right/count for certain chromosome.
+
+        """
+        if chromosome in self.locations:
+            return self.locations[chromosome]
+        else:
+            raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome))
+
+    @cython.ccall
+    def get_chr_names(self) -> set:
+        """Return all the chromosome names in this track object as a
+        python set.
+
+        """
+        return set(self.locations.keys())
+
+    @cython.ccall
+    def sort(self):
+        """Naive sorting for locations. The barcodes are reordered
+        with the same permutation so they stay aligned.
+
+        """
+        c: bytes
+        chrnames: set
+        indices: cnp.ndarray
+
+        chrnames = self.get_chr_names()
+
+        for c in chrnames:
+            indices = np.argsort(self.locations[c], order=['l', 'r'])
+            self.locations[c] = self.locations[c][indices]
+            self.barcodes[c] = self.barcodes[c][indices]
+        self.is_sorted = True
+        return
+
+    @cython.ccall
+    def count_fraglengths(self) -> dict:
+        """Return a dictionary of the counts for sizes/fragment
+        lengths of each pair.
+
+        This function is for HMMRATAC.
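finalize() and sort() above use argsort rather than an in-place sort because two parallel arrays must stay aligned; a small demonstration of why the same permutation has to be applied to both:

    import numpy as np

    locs = np.array([(10, 60, 1), (5, 55, 2)],
                    dtype=[('l', 'i4'), ('r', 'i4'), ('c', 'u1')])
    barcodes = np.array([7, 3], dtype='i4')

    idx = np.argsort(locs, order=['l', 'r'])   # one permutation ...
    locs, barcodes = locs[idx], barcodes[idx]  # ... applied to both arrays
    print(locs['l'].tolist(), barcodes.tolist())   # [5, 10] [3, 7]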
+
+        """
+        sizes: cnp.ndarray(cnp.int32_t, ndim=1)
+        counts: cnp.ndarray
+        locs: cnp.ndarray
+        chrnames: list
+        i: cython.int
+        j: cython.long
+
+        counter = Counter()
+        chrnames = list(self.get_chr_names())
+        for i in range(len(chrnames)):
+            locs = self.locations[chrnames[i]]
+            sizes = locs['r'] - locs['l']
+            counts = locs['c']
+            for j in range(sizes.shape[0]):
+                # each fragment contributes its count, not just 1
+                counter[sizes[j]] += int(counts[j])
+        return dict(counter)
+
+    @cython.ccall
+    def fraglengths(self) -> cnp.ndarray:
+        """Return the sizes/fragment lengths of each pair.
+
+        This function is for HMMRATAC EM training.
+        """
+        sizes: cnp.ndarray(cnp.int32_t, ndim=1)
+        t_sizes: cnp.ndarray(cnp.int32_t, ndim=1)
+        locs: cnp.ndarray
+        chrnames: list
+        i: cython.int
+
+        chrnames = list(self.get_chr_names())
+        locs = self.locations[chrnames[0]]
+        # repeat each size by its count so a fragment seen c times is
+        # represented c times
+        sizes = np.repeat(locs['r'] - locs['l'], locs['c'])
+        for i in range(1, len(chrnames)):
+            locs = self.locations[chrnames[i]]
+            t_sizes = np.repeat(locs['r'] - locs['l'], locs['c'])
+            sizes = np.concatenate((sizes, t_sizes))
+        return sizes
+
+    @cython.ccall
+    def subset(self, selected_barcodes: set):
+        """Make a subset of PETrackII with only the given barcodes.
+
+        Note: the selected_barcodes is a set of barcodes in python
+        bytes. For example, {b"ATCTGCTAGTCTACAT", b"ATTCTCGATGCAGTCA"}
+
+        """
+        indices: cnp.ndarray
+        chrs: set
+        selected_barcodes_filtered: list
+        selected_barcodes_n: list
+        chromosome: bytes
+        ret: PETrackII
+
+        ret = PETrackII()
+        chrs = self.get_chr_names()
+
+        # first we need to convert barcodes into integers in our
+        # barcode_dict
+        selected_barcodes_filtered = [b
+                                      for b in selected_barcodes
+                                      if b in self.barcode_dict]
+        ret.barcode_dict = {b: self.barcode_dict[b]
+                            for b in selected_barcodes_filtered}
+        selected_barcodes_n = [self.barcode_dict[b]
+                               for b in selected_barcodes_filtered]
+        ret.barcode_last_n = self.barcode_last_n
+
+        # pass some values from self to ret
+        ret.annotation = self.annotation
+        ret.is_sorted = self.is_sorted
+        ret.rlengths = self.rlengths
+        ret.buffer_size = self.buffer_size
+        ret.total = 0
+        ret.length = 0
+        ret.average_template_length = 0
+        ret.is_destroyed = False
+
+        for chromosome in sorted(chrs):
+            indices = np.where(np.isin(self.barcodes[chromosome],
+                                       list(selected_barcodes_n)))[0]
+            ret.barcodes[chromosome] = self.barcodes[chromosome][indices]
+            ret.locations[chromosome] = self.locations[chromosome][indices]
+            ret.size[chromosome] = len(ret.locations[chromosome])
+            ret.buf_size[chromosome] = ret.size[chromosome]
+            ret.total += np.sum(ret.locations[chromosome]['c'])
+            ret.length += np.sum((ret.locations[chromosome]['r'] -
+                                  ret.locations[chromosome]['l']) *
+                                 ret.locations[chromosome]['c'])
+        ret.average_template_length = ret.length / ret.total
+        return ret
+
+    @cython.ccall
+    def pileup_a_chromosome(self,
+                            chrom: bytes) -> cnp.ndarray:
+        """pileup a certain chromosome, return p-v ndarray (end
+        position and pileup value).
+        """
+        return pileup_from_LRC(self.locations[chrom])
+
+    @cython.ccall
+    def pileup_bdg(self):
+        """Pileup all chromosomes and return a bedGraphTrackI object.
+        """
+        bdg: bedGraphTrackI
+        pv: cnp.ndarray
+
+        bdg = bedGraphTrackI()
+        for chrom in self.get_chr_names():
+            pv = pileup_from_LRC(self.locations[chrom])
+            bdg.add_chrom_data_PV(chrom, pv)
+        return bdg
+
+    @cython.ccall
+    def pileup_bdg2(self):
+        """Pileup all chromosomes and return a bedGraphTrackII object.
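pileup_from_LRC (from MACS3.Signal.PileupV2) consumes these (l, r, c) records; the following is a naive, quadratic stand-in that only shows what a count-weighted pileup means. tiny_pileup_lrc is a hypothetical helper for tiny inputs, not the module's implementation.

    import numpy as np

    def tiny_pileup_lrc(locs):
        # each fragment adds its count 'c' over [l, r); returns
        # (end position, value) pairs in the p-v spirit above
        edges = np.unique(np.concatenate((locs['l'], locs['r'])))
        out = []
        for s, e in zip(edges[:-1], edges[1:]):
            v = int((locs['c'] * ((locs['l'] <= s) & (locs['r'] > s))).sum())
            out.append((int(e), v))
        return out

    locs = np.array([(0, 10, 2), (5, 15, 1)],
                    dtype=[('l', 'i4'), ('r', 'i4'), ('c', 'u1')])
    print(tiny_pileup_lrc(locs))   # [(5, 2), (10, 3), (15, 1)]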
+ """ + bdg: bedGraphTrackII + pv: cnp.ndarray + + bdg = bedGraphTrackII() + for chrom in self.get_chr_names(): + pv = pileup_from_LRC(self.locations[chrom]) + bdg.add_chrom_data(chrom, pv) + # bedGraphTrackII needs to be 'finalized'. + bdg.finalize() + return bdg diff --git a/MACS3/Signal/PairedEndTrack.pyx b/MACS3/Signal/PairedEndTrack.pyx deleted file mode 100644 index 808f5d1c..00000000 --- a/MACS3/Signal/PairedEndTrack.pyx +++ /dev/null @@ -1,584 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2022-09-15 17:07:26 Tao Liu> - -"""Module for filter duplicate tags from paired-end data - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). -""" - -# ------------------------------------ -# Python modules -# ------------------------------------ -import io -import sys -from copy import copy -from array import array as pyarray -from collections import Counter - -import logging -import MACS3.Utilities.Logger - -logger = logging.getLogger(__name__) -debug = logger.debug -info = logger.info -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.Utilities.Constants import * -from MACS3.Signal.Pileup import quick_pileup, over_two_pv_array, se_all_in_one_pileup -from MACS3.Signal.BedGraph import bedGraphTrackI -from MACS3.Signal.PileupV2 import pileup_from_LR_hmmratac -# ------------------------------------ -# Other modules -# ------------------------------------ -import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t -from cpython cimport bool -cimport cython - - -cdef INT_MAX = (((-1))>>1) - -# We don't use the following structs anymore -# cdef packed struct peLoc: -# int32_t l -# int32_t r - -# cdef class PETrackChromosome: -# cdef: -# public np.ndarray locations -# public uint32_t pointer -# public uint32_t buffer_size -# public uint64_t coverage -# public uint64_t chrlen -# uint32_t __buffer_increment -# bool __sorted -# bool __destroyed - -# Let numpy enforce PE-ness using ndarray, gives bonus speedup when sorting -# PE data doesn't have strandedness - -cdef class PETrackI: - """Paired End Locations Track class I along the whole genome - (commonly with the same annotation type), which are stored in a - dict. - - Locations are stored and organized by sequence names (chr names) in a - dict. They can be sorted by calling self.sort() function. - """ - cdef: - public dict __locations - public dict __size - public dict __buf_size - public bool __sorted - public uint64_t total - public object annotation - public dict rlengths - public int64_t buffer_size - public int64_t length - public float32_t average_template_length - bool __destroyed - - def __init__ (self, char * anno="", int64_t buffer_size = 100000 ): - """fw is the fixed-width for all locations. 
- - """ - self.__locations = {} # dictionary with chrname as key, nparray with [('l','int32'),('r','int32')] as value - self.__size = {} # dictionary with chrname as key, size of the above nparray as value - self.__buf_size = {} # dictionary with chrname as key, size of the above nparray as value - self.__sorted = False - self.total = 0 # total fragments - self.annotation = anno # need to be figured out - self.rlengths = {} - self.buffer_size = buffer_size - self.length = 0 - self.average_template_length = 0.0 - - cpdef void add_loc ( self, bytes chromosome, int32_t start, int32_t end): - """Add a location to the list according to the sequence name. - - chromosome -- mostly the chromosome name - fiveendpos -- 5' end pos, left for plus strand, right for neg strand - """ - cdef: - int32_t i - - if chromosome not in self.__locations: - self.__buf_size[chromosome] = self.buffer_size - self.__locations[chromosome] = np.zeros(shape=self.buffer_size, dtype=[('l','int32'),('r','int32')]) # note: ['l'] is the leftmost end, ['r'] is the rightmost end of fragment. - self.__locations[chromosome][0] = ( start, end ) - self.__size[chromosome] = 1 - else: - i = self.__size[chromosome] - if self.__buf_size[chromosome] == i: - self.__buf_size[chromosome] += self.buffer_size - self.__locations[chromosome].resize((self.__buf_size[chromosome]), refcheck = False ) - self.__locations[chromosome][ i ] = ( start, end ) - self.__size[chromosome] = i + 1 - self.length += end - start - return - - cpdef void destroy ( self ): - """Destroy this object and release mem. - """ - cdef: - set chrs - bytes chromosome - - chrs = self.get_chr_names() - for chromosome in sorted(chrs): - if chromosome in self.__locations: - self.__locations[chromosome].resize( self.buffer_size, refcheck=False ) - self.__locations[chromosome].resize( 0, refcheck=False ) - self.__locations[chromosome] = None - self.__locations.pop(chromosome) - self.__destroyed = True - return - - cpdef bint set_rlengths ( self, dict rlengths ): - """Set reference chromosome lengths dictionary. - - Only the chromosome existing in this petrack object will be updated. - - If a chromosome in this petrack is not covered by given - rlengths, and it has no associated length, it will be set as - maximum integer. - """ - cdef: - set valid_chroms, missed_chroms - bytes chrom - - valid_chroms = set(self.__locations.keys()).intersection(rlengths.keys()) - for chrom in sorted(valid_chroms): - self.rlengths[chrom] = rlengths[chrom] - missed_chroms = set(self.__locations.keys()).difference(rlengths.keys()) - for chrom in sorted(missed_chroms): - self.rlengths[chrom] = INT_MAX - return True - - cpdef dict get_rlengths ( self ): - """Get reference chromosome lengths dictionary. - - If self.rlengths is empty, create a new dict where the length of - chromosome will be set as the maximum integer. - """ - if not self.rlengths: - self.rlengths = dict([(k, INT_MAX) for k in self.__locations.keys()]) - return self.rlengths - - cpdef void finalize ( self ): - """ Resize np arrays for 5' positions and sort them in place - - Note: If this function is called, it's impossible to append more files to this FWTrack object. So remember to call it after all the files are read! 
- """ - - cdef: - int32_t i - bytes c - set chrnames - - self.total = 0 - - chrnames = self.get_chr_names() - - for c in chrnames: - self.__locations[c].resize((self.__size[c]), refcheck=False) - self.__locations[c].sort( order=['l', 'r'] ) - self.total += self.__size[c] - - self.__sorted = True - self.average_template_length = ( self.length ) / self.total - return - - cpdef get_locations_by_chr ( self, bytes chromosome ): - """Return a tuple of two lists of locations for certain chromosome. - - """ - if chromosome in self.__locations: - return self.__locations[chromosome] - else: - raise Exception("No such chromosome name (%s) in TrackI object!\n" % (chromosome)) - - cpdef set get_chr_names ( self ): - """Return all the chromosome names in this track object as a python set. - """ - return set(self.__locations.keys()) - - - cpdef void sort ( self ): - """Naive sorting for locations. - - """ - cdef: - uint32_t i - bytes c - set chrnames - - chrnames = self.get_chr_names() - - for c in chrnames: - #print "before", self.__locations[c][0:100] - self.__locations[c].sort( order=['l', 'r'] ) # sort by the leftmost location - #print "before", self.__locations[c][0:100] - self.__sorted = True - return - - cpdef dict count_fraglengths ( self ): - """Return a dictionary of the counts for sizes/fragment lengths of each pair. - - This function is for HMMRATAC. - """ - cdef: - np.ndarray[np.int32_t, ndim=1] sizes - np.int32_t s - np.ndarray locs - list chrnames - int i - #dict ret_dict - bytes k - - counter = Counter() - chrnames = list( self.get_chr_names() ) - for i in range( len(chrnames) ): - locs = self.__locations[ chrnames[i] ] - sizes = locs['r'] - locs['l'] - for s in sizes: - counter[ s ] += 1 - return dict(counter) - - cpdef np.ndarray fraglengths ( self ): - """Return the sizes/fragment lengths of each pair. - - This function is for HMMRATAC EM training. - """ - cdef: - np.ndarray[np.int32_t, ndim=1] sizes - np.ndarray locs - list chrnames - int i - - chrnames = list( self.get_chr_names() ) - locs = self.__locations[ chrnames[ 0 ] ] - sizes = locs['r'] - locs['l'] - for i in range( 1, len(chrnames) ): - locs = self.__locations[ chrnames[i] ] - sizes = np.concatenate( ( sizes, locs['r'] - locs['l'] ) ) - return sizes - - @cython.boundscheck(False) # do not check that np indices are valid - cpdef void filter_dup ( self, int32_t maxnum=-1): - """Filter the duplicated reads. - - Run it right after you add all data into this object. 
- """ - cdef: - int32_t i_chrom, n, start, end - int32_t loc_start, loc_end, current_loc_start, current_loc_end - uint64_t i - bytes k - np.ndarray locs - uint64_t locs_size - set chrnames - np.ndarray selected_idx - - if maxnum < 0: return # condition to return if not filtering - - if not self.__sorted: self.sort() - - self.total = 0 - #self.length = 0 - self.average_template_length = 0.0 - - chrnames = self.get_chr_names() - - for k in chrnames: # for each chromosome - locs = self.__locations[k] - locs_size = locs.shape[0] - if locs_size == 1: - # do nothing and continue - continue - # discard duplicate reads and make a new __locations[k] - # initialize boolean array as all TRUE, or all being kept - selected_idx = np.ones( locs_size, dtype=bool) - # get the first loc - ( current_loc_start, current_loc_end ) = locs[0] - i = 1 # index of new_locs - n = 1 # the number of tags in the current genomic location - for i in range(1, locs_size): - ( loc_start, loc_end ) = locs[i] - if loc_start != current_loc_start or loc_end != current_loc_end: - # not the same, update currnet_loc_start/end/l, reset n - current_loc_start = loc_start - current_loc_end = loc_end - n = 1 - continue - else: - # both ends are the same, add 1 to duplicate number n - n += 1 - if n > maxnum: - # change the flag to False - selected_idx[ i ] = False - # subtract current_loc_l from self.length - self.length -= current_loc_end - current_loc_start - self.__locations[k] = locs[ selected_idx ] - self.__size[k] = self.__locations[k].shape[0] - self.total += self.__size[k] - # free memory? - # I know I should shrink it to 0 size directly, - # however, on Mac OSX, it seems directly assigning 0 - # doesn't do a thing. - selected_idx.resize( self.buffer_size, refcheck=False) - selected_idx.resize( 0, refcheck=False) - self.average_template_length = self.length / self.total - return - - cpdef void sample_percent (self, float32_t percent, int32_t seed = -1): - """Sample the tags for a given percentage. - - Warning: the current object is changed! If a new PETrackI is wanted, use sample_percent_copy instead. - """ - cdef: - uint32_t num, i_chrom # num: number of reads allowed on a certain chromosome - bytes k - set chrnames - object rs, rs_shuffle - - self.total = 0 - self.length = 0 - self.average_template_length = 0.0 - - chrnames = self.get_chr_names() - - if seed >= 0: - info(f"# A random seed {seed} has been used") - rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed))) - rs_shuffle = rs.shuffle - else: - rs_shuffle = np.random.shuffle - - for k in sorted(chrnames): - # for each chromosome. - # This loop body is too big, I may need to split code later... - - num = round(self.__locations[k].shape[0] * percent, 5 ) - rs_shuffle( self.__locations[k] ) - self.__locations[k].resize( num, refcheck = False ) - self.__locations[k].sort( order = ['l', 'r'] ) # sort by leftmost positions - self.__size[k] = self.__locations[k].shape[0] - self.length += ( self.__locations[k]['r'] - self.__locations[k]['l'] ).sum() - self.total += self.__size[k] - self.average_template_length = ( self.length )/ self.total - return - - cpdef object sample_percent_copy (self, float32_t percent, int32_t seed = -1): - """Sample the tags for a given percentage. 
Return a new PETrackI object - - """ - cdef: - uint32_t num, i_chrom # num: number of reads allowed on a certain chromosome - bytes k - set chrnames - object ret_petrackI, rs, rs_shuffle - np.ndarray l - - ret_petrackI = PETrackI( anno=self.annotation, buffer_size = self.buffer_size) - chrnames = self.get_chr_names() - - if seed >= 0: - info(f"# A random seed {seed} has been used in the sampling function") - rs = np.random.default_rng(seed) - else: - rs = np.random.default_rng() - - rs_shuffle = rs.shuffle - for k in sorted(chrnames): # chrnames need to be sorted otherwise we can't assure reproducibility - # for each chromosome. - # This loop body is too big, I may need to split code later... - l = np.copy( self.__locations[k] ) - num = round(l.shape[0] * percent, 5 ) - rs_shuffle( l ) - l.resize( num, refcheck = False ) - l.sort( order = ['l', 'r'] ) # sort by leftmost positions - ret_petrackI.__locations[ k ] = l - ret_petrackI.__size[ k ] = l.shape[0] - ret_petrackI.length += ( l['r'] - l['l'] ).sum() - ret_petrackI.total += ret_petrackI.__size[ k ] - ret_petrackI.average_template_length = ( ret_petrackI.length )/ ret_petrackI.total - ret_petrackI.set_rlengths( self.get_rlengths() ) - return ret_petrackI - - cpdef void sample_num (self, uint64_t samplesize, int32_t seed = -1): - """Sample the tags for a given number. - - Warning: the current object is changed! - """ - cdef: - float32_t percent - percent = (samplesize)/self.total - self.sample_percent ( percent, seed ) - return - - cpdef object sample_num_copy (self, uint64_t samplesize, int32_t seed = -1): - """Sample the tags for a given number. - - Warning: the current object is changed! - """ - cdef: - float32_t percent - percent = (samplesize)/self.total - return self.sample_percent_copy ( percent, seed ) - - cpdef void print_to_bed (self, fhd=None): - """Output to BEDPE format files. If fhd is given, write to a - file, otherwise, output to standard output. - - """ - cdef: - int32_t i, i_chrom, s, e - bytes k - set chrnames - - - if not fhd: - fhd = sys.stdout - assert isinstance(fhd, io.IOBase) - - chrnames = self.get_chr_names() - - for k in chrnames: - # for each chromosome. - # This loop body is too big, I may need to split code later... - - locs = self.__locations[k] - - for i in range(locs.shape[0]): - s, e = locs[ i ] - fhd.write("%s\t%d\t%d\n" % (k.decode(), s, e)) - return - - cpdef list pileup_a_chromosome ( self, bytes chrom, list scale_factor_s, float32_t baseline_value = 0.0 ): - """pileup a certain chromosome, return [p,v] (end position and value) list. - - scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - """ - cdef: - list tmp_pileup, prev_pileup - float32_t scale_factor - - prev_pileup = None - - for i in range(len(scale_factor_s)): - scale_factor = scale_factor_s[i] - - tmp_pileup = quick_pileup ( np.sort(self.__locations[chrom]['l']), np.sort(self.__locations[chrom]['r']), scale_factor, baseline_value ) # Can't directly pass partial nparray there since that will mess up with pointer calculation. - - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - - return prev_pileup - - cpdef list pileup_a_chromosome_c ( self, bytes chrom, list ds, list scale_factor_s, float32_t baseline_value = 0.0 ): - """pileup a certain chromosome, return [p,v] (end position and value) list. 
- - This function is for control track. Basically, here is a - simplified function from FixWidthTrack. We pretend the PE is - SE data and left read is on plus strand and right read is on - minus strand. - - ds : tag will be extended to this value to 3' direction, - unless directional is False. Can contain multiple extension - values. Final pileup will the maximum. - scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - """ - cdef: - list tmp_pileup, prev_pileup - float32_t scale_factor - int64_t d, five_shift, three_shift - int64_t rlength = self.get_rlengths()[chrom] - - if not self.__sorted: self.sort() - - assert len(ds) == len(scale_factor_s), "ds and scale_factor_s must have the same length!" - - prev_pileup = None - - for i in range(len(scale_factor_s)): - d = ds[i] - scale_factor = scale_factor_s[i] - five_shift = d//2 - three_shift= d//2 - - tmp_pileup = se_all_in_one_pileup ( self.__locations[chrom]['l'], self.__locations[chrom]['r'], five_shift, three_shift, rlength, scale_factor, baseline_value ) - - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - - return prev_pileup - - - cpdef object pileup_bdg ( self, list scale_factor_s, float32_t baseline_value = 0.0 ): - """pileup all chromosomes, and return a bedGraphTrackI object. - - scale_factor_s : linearly scale the pileup value applied to each d in ds. The list should have the same length as ds. - baseline_value : a value to be filled for missing values, and will be the minimum pileup. - """ - cdef: - list tmp_pileup, prev_pileup - float32_t scale_factor - bytes chrom - object bdg - int32_t prev_s - - #info(f"start to pileup") - bdg = bedGraphTrackI( baseline_value = baseline_value ) - - for chrom in sorted(self.get_chr_names()): - prev_pileup = None - for i in range(len(scale_factor_s)): - scale_factor = scale_factor_s[i] - - tmp_pileup = quick_pileup ( np.sort(self.__locations[chrom]['l']), np.sort(self.__locations[chrom]['r']), scale_factor, baseline_value ) # Can't directly pass partial nparray there since that will mess up with pointer calculation. - - if prev_pileup: - prev_pileup = over_two_pv_array ( prev_pileup, tmp_pileup, func="max" ) - else: - prev_pileup = tmp_pileup - # save to bedGraph - bdg.add_chrom_data( chrom, pyarray('i', prev_pileup[0]), pyarray('f', prev_pileup[1]) ) - return bdg - - cpdef list pileup_bdg_hmmr ( self, list mapping, float32_t baseline_value = 0.0 ): - """pileup all chromosomes, and return a list of four bedGraphTrackI objects: short, mono, di, and tri nucleosomal signals. - - The idea is that for each fragment length, we generate four bdg using four weights from four distributions. Then we add all sets of four bdgs together. 
-
-        Way to generate 'mapping', based on HMMR EM means and stddevs:
-        fl_dict = petrack.count_fraglengths()
-        fl_list = list(fl_dict.keys())
-        fl_list.sort()
-        weight_mapping = generate_weight_mapping( fl_list, em_means, em_stddevs )
-        """
-        cdef:
-            list ret_pileup
-            set chroms
-            bytes chrom
-            int i
-
-        ret_pileup = []
-        for i in range( len(mapping) ): ret_pileup.append( {} )
-        chroms = self.get_chr_names()
-        for i in range( len(mapping) ):
-            for chrom in sorted(chroms):
-                ret_pileup[ i ][ chrom ] = pileup_from_LR_hmmratac( self.__locations[ chrom ], mapping[ i ] )
-        return ret_pileup
-
diff --git a/MACS3/Signal/PeakDetect.py b/MACS3/Signal/PeakDetect.py
new file mode 100644
index 00000000..cea6f442
--- /dev/null
+++ b/MACS3/Signal/PeakDetect.py
@@ -0,0 +1,412 @@
+# cython: language_level=3
+# cython: profile=True
+# Time-stamp: <2024-10-15 10:38:40 Tao Liu>
+
+"""Module Description: Detect peaks, main module
+
+This code is free software; you can redistribute it and/or modify it
+under the terms of the BSD License (see the file LICENSE included with
+the distribution).
+"""
+# ------------------------------------
+# Python modules
+# ------------------------------------
+import cython
+
+# ------------------------------------
+# MACS3 modules
+# ------------------------------------
+# from MACS3.Utilities.Constants import *
+from MACS3.Signal.CallPeakUnit import CallerFromAlignments
+
+
+@cython.cfunc
+def subpeak_letters(i: cython.short) -> bytes:
+    if i < 26:
+        return chr(97+i).encode()
+    else:
+        return subpeak_letters(i // 26) + chr(97 + (i % 26)).encode()
+
+
+class PeakDetect:
+    """Class to do the peak calling.
+
+    e.g.
+    >>> from MACS3.Signal.PeakDetect import PeakDetect
+    >>> pd = PeakDetect(opt=opt, treat=treatdata, control=controldata, d=100)
+    >>> pd.call_peaks()
+    """
+    def __init__(self,
+                 opt=None,
+                 treat=None,
+                 control=None,
+                 d=None,
+                 maxgap=None,
+                 minlen=None,
+                 slocal=None,
+                 llocal=None):
+        """Initialize the PeakDetect object.
+
+        """
+        self.opt = opt
+        self.info = opt.info
+        self.debug = opt.debug
+        self.warn = opt.warn
+
+        self.treat = treat
+        self.control = control
+        self.ratio_treat2control = None
+        self.peaks = None
+        self.final_peaks = None
+        self.PE_MODE = opt.PE_MODE
+        self.scoretrack = None
+
+        # self.femax = opt.femax
+        # self.femin = opt.femin
+        # self.festep = opt.festep
+
+        self.log_pvalue = opt.log_pvalue    # -log10pvalue
+        self.log_qvalue = opt.log_qvalue    # -log10qvalue
+        if d is not None:
+            self.d = d
+        else:
+            self.d = self.opt.d
+
+        if opt.maxgap:
+            self.maxgap = opt.maxgap
+        else:
+            self.maxgap = opt.tsize
+
+        if opt.minlen:
+            self.minlen = opt.minlen
+        else:
+            self.minlen = self.d
+
+        self.end_shift = self.opt.shift
+        self.gsize = opt.gsize
+
+        self.nolambda = opt.nolambda
+
+        if slocal is not None:
+            self.sregion = slocal
+        else:
+            self.sregion = opt.smalllocal
+
+        if llocal is not None:
+            self.lregion = llocal
+        else:
+            self.lregion = opt.largelocal
+
+        if self.nolambda:
+            self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!")
+        # self.diag = opt.diag
+        # self.save_score = opt.store_score
+        # self.zwig_tr = opt.zwig_tr
+        # self.zwig_ctl= opt.zwig_ctl
+
+    def call_peaks(self):
+        """Call peaks function.
+
+        Scan the whole genome for peaks. RESULTS WILL BE SAVED IN
+        self.peaks.
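A quick check of the subpeak_letters helper defined at the top of this new file. It produces base-26 letter suffixes for summits within one peak; note, directly from the recursion, that index 26 yields b'ba' rather than b'aa':

    for i in (0, 1, 25, 26, 27):
        print(i, subpeak_letters(i))
    # 0 b'a'
    # 1 b'b'
    # 25 b'z'
    # 26 b'ba'
    # 27 b'bb'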
+        """
+        if self.control:        # w/ control
+            # if self.opt.broad:
+            #     (self.peaks,self.broadpeaks) = self.__call_peaks_w_control()
+            # else:
+            self.peaks = self.__call_peaks_w_control()
+        else:                   # w/o control
+            # if self.opt.broad:
+            #     (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control()
+            # else:
+            self.peaks = self.__call_peaks_wo_control()
+        return self.peaks
+
+    def __call_peaks_w_control(self):
+        """To call peaks with control data.
+
+        A peak info type is a dictionary:
+
+        key: chromosome
+
+        value: a tuple of (peak start, peak end, peak length, peak
+        summit, peak height, number of tags in peak region, peak
+        pvalue, peak fold_enrichment)
+
+        While calculating pvalue:
+
+        First, t and c will be adjusted by the ratio between total
+        reads in treatment and total reads in control, depending on
+        --to-small option.
+
+        Then, t and c will be multiplied by the smallest peak size --
+        self.d.
+
+        Finally, a poisson CDF is applied to calculate one-side pvalue
+        for enrichment.
+        """
+        lambda_bg: cython.float
+        treat_scale: cython.float
+        d: cython.float
+        ctrl_scale_s: list
+        ctrl_d_s: list
+        control_total: cython.long
+        # approx sum of treatment pileup values
+        treat_sum: cython.long
+        # approx sum of control pileup values
+        control_sum: cython.long
+
+        if self.PE_MODE:
+            d = self.treat.average_template_length
+            # in PE mode, entire fragment is counted as 1 in treatment
+            # whereas both ends of fragment are counted in
+            # control/input.
+            control_total = self.control.total * 2
+            treat_sum = self.treat.length
+            control_sum = control_total * self.treat.average_template_length
+            self.ratio_treat2control = float(treat_sum)/control_sum
+        else:
+            d = self.d
+            control_total = self.control.total
+            treat_sum = self.treat.total * self.d
+            control_sum = self.control.total * self.d
+            self.ratio_treat2control = float(treat_sum)/control_sum
+
+        if self.opt.ratio != 1.0:
+            self.ratio_treat2control = self.opt.ratio
+
+        if self.opt.tocontrol:
+            # if MACS decides to scale treatment to control data
+            # because treatment is bigger
+            lambda_bg = float(control_sum) / self.gsize
+            treat_scale = 1/self.ratio_treat2control
+        else:
+            # if MACS decides to scale control to treatment because
+            # control sample is bigger
+            lambda_bg = float(treat_sum) / self.gsize
+            treat_scale = 1.0
+
+        # prepare d_s for control data
+        if self.sregion:
+            assert self.d <= self.sregion, f"{self.sregion:} can't be smaller than {self.d:}!"
+        if self.lregion:
+            assert self.d <= self.lregion, f"{self.lregion:} can't be smaller than {self.d:}!"
+            assert self.sregion <= self.lregion, f"{self.lregion:} can't be smaller than {self.sregion:}!"
+
+        # Now prepare a list of extension sizes
+        ctrl_d_s = [self.d]     # note, d doesn't make sense in PE mode.
+        # And a list of scaling factors for control
+        ctrl_scale_s = []
+
+        # d
+        if not self.opt.tocontrol:
+            # if user wants to scale everything to ChIP data
+            tmp_v = self.ratio_treat2control
+        else:
+            tmp_v = 1.0
+        ctrl_scale_s.append(tmp_v)
+
+        # slocal size local
+        if self.sregion:
+            ctrl_d_s.append(self.sregion)
+            if not self.opt.tocontrol:
+                # if user wants to scale everything to ChIP data
+                tmp_v = float(self.d)/self.sregion*self.ratio_treat2control
+            else:
+                tmp_v = float(self.d)/self.sregion
+            ctrl_scale_s.append(tmp_v)
+
+        # llocal size local
+        if self.lregion and self.lregion > self.sregion:
+            ctrl_d_s.append(self.lregion)
+            if not self.opt.tocontrol:
+                # if user wants to scale everything to ChIP data
+                tmp_v = float(self.d)/self.lregion*self.ratio_treat2control
+            else:
+                tmp_v = float(self.d)/self.lregion
+            ctrl_scale_s.append(tmp_v)
+
+        # if self.PE_MODE:      # first d/scale are useless in PE mode
+        #     ctrl_d_s = ctrl_d_s[1:]
+        #     ctrl_scale_s = ctrl_scale_s[1:]
+        # print ctrl_d_s
+        # print ctrl_scale_s
+        if self.nolambda:
+            ctrl_d_s = []
+            ctrl_scale_s = []
+
+        scorecalculator = CallerFromAlignments(self.treat, self.control,
+                                               d=d, ctrl_d_s=ctrl_d_s,
+                                               treat_scaling_factor=treat_scale,
+                                               ctrl_scaling_factor_s=ctrl_scale_s,
+                                               end_shift=self.end_shift,
+                                               lambda_bg=lambda_bg,
+                                               save_bedGraph=self.opt.store_bdg,
+                                               bedGraph_filename_prefix=self.opt.name,
+                                               bedGraph_treat_filename=self.opt.bdg_treat,
+                                               bedGraph_control_filename=self.opt.bdg_control,
+                                               save_SPMR=self.opt.do_SPMR,
+                                               cutoff_analysis_filename=self.opt.cutoff_analysis_file)
+
+        if self.opt.trackline:
+            scorecalculator.enable_trackline()
+
+        # call peaks
+        call_summits = self.opt.call_summits
+        if call_summits:
+            self.info("#3 Going to call summits inside each peak ...")
+
+        if self.log_pvalue is not None:
+            if self.opt.broad:
+                self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." %
+                          (self.log_pvalue, self.opt.log_broadcutoff))
+                peaks = scorecalculator.call_broadpeaks(['p',],
+                                                        lvl1_cutoff_s=[self.log_pvalue,],
+                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
+                                                        min_length=self.minlen,
+                                                        lvl1_max_gap=self.maxgap,
+                                                        lvl2_max_gap=self.maxgap*4,
+                                                        cutoff_analysis=self.opt.cutoff_analysis)
+            else:
+                self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
+                peaks = scorecalculator.call_peaks(['p',], [self.log_pvalue,],
+                                                   min_length=self.minlen,
+                                                   max_gap=self.maxgap,
+                                                   call_summits=call_summits,
+                                                   cutoff_analysis=self.opt.cutoff_analysis)
+        elif self.log_qvalue is not None:
+            if self.opt.broad:
+                self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." %
+                          (self.log_qvalue, self.opt.log_broadcutoff))
+                peaks = scorecalculator.call_broadpeaks(['q',],
+                                                        lvl1_cutoff_s=[self.log_qvalue,],
+                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
+                                                        min_length=self.minlen,
+                                                        lvl1_max_gap=self.maxgap,
+                                                        lvl2_max_gap=self.maxgap*4,
+                                                        cutoff_analysis=self.opt.cutoff_analysis)
+            else:
+                peaks = scorecalculator.call_peaks(['q',], [self.log_qvalue,],
+                                                   min_length=self.minlen,
+                                                   max_gap=self.maxgap,
+                                                   call_summits=call_summits,
+                                                   cutoff_analysis=self.opt.cutoff_analysis)
+        scorecalculator.destroy()
+        return peaks
+
+    def __call_peaks_wo_control(self):
+        """To call peaks without control data.
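Stepping back to the control extension sizes and scaling factors assembled in __call_peaks_w_control above: a worked example, assuming d=200, slocal=1000, llocal=10000 and opt.tocontrol=True (so each factor reduces to d divided by the window size). These numbers are illustrative, not defaults taken from the code.

    d, slocal, llocal = 200, 1000, 10000
    ctrl_d_s = [d, slocal, llocal]
    ctrl_scale_s = [1.0, d / slocal, d / llocal]
    print(list(zip(ctrl_d_s, ctrl_scale_s)))
    # [(200, 1.0), (1000, 0.2), (10000, 0.02)]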
+
+        A peak info type is a dictionary:
+
+        key: chromosome
+
+        value: a tuple of (peak start, peak end, peak length, peak
+        summit, peak height, number of tags in peak region, peak
+        pvalue, peak fold_enrichment)
+
+        While calculating pvalue:
+
+        First, t and c will be adjusted by the ratio between total
+        reads in treatment and total reads in control, depending on
+        --to-small option.
+
+        Then, t and c will be multiplied by the smallest peak size --
+        self.d.
+
+        Finally, a poisson CDF is applied to calculate one-side pvalue
+        for enrichment.
+        """
+        lambda_bg: cython.float
+        treat_scale: cython.float = 1
+        d: cython.float
+        ctrl_scale_s: list
+        ctrl_d_s: list
+
+        if self.PE_MODE:
+            d = 0
+        else:
+            d = self.d
+        treat_length = self.treat.length
+        treat_total = self.treat.total
+
+        # global lambda
+        if self.PE_MODE:
+            # this is an estimator; we should maybe test it for accuracy
+            lambda_bg = treat_length / self.gsize
+        else:
+            lambda_bg = float(d) * treat_total / self.gsize
+            treat_scale = 1.0
+
+        # slocal and d-size local bias are not calculated!
+        # nothing done here. should this match w control??
+
+        if not self.nolambda:
+            if self.PE_MODE:
+                ctrl_scale_s = [float(treat_length) / (self.lregion*treat_total*2),]
+            else:
+                ctrl_scale_s = [float(self.d) / self.lregion,]
+            ctrl_d_s = [self.lregion,]
+        else:
+            ctrl_scale_s = []
+            ctrl_d_s = []
+
+        scorecalculator = CallerFromAlignments(self.treat, None,
+                                               d=d,
+                                               ctrl_d_s=ctrl_d_s,
+                                               treat_scaling_factor=treat_scale,
+                                               ctrl_scaling_factor_s=ctrl_scale_s,
+                                               end_shift=self.end_shift,
+                                               lambda_bg=lambda_bg,
+                                               save_bedGraph=self.opt.store_bdg,
+                                               bedGraph_filename_prefix=self.opt.name,
+                                               bedGraph_treat_filename=self.opt.bdg_treat,
+                                               bedGraph_control_filename=self.opt.bdg_control,
+                                               save_SPMR=self.opt.do_SPMR,
+                                               cutoff_analysis_filename=self.opt.cutoff_analysis_file)
+
+        if self.opt.trackline:
+            scorecalculator.enable_trackline()
+
+        # call peaks
+        call_summits = self.opt.call_summits
+        if call_summits:
+            self.info("#3 Going to call summits inside each peak ...")
+
+        if self.log_pvalue is not None:
+            if self.opt.broad:
+                self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." %
+                          (self.log_pvalue, self.opt.log_broadcutoff))
+                peaks = scorecalculator.call_broadpeaks(['p',],
+                                                        lvl1_cutoff_s=[self.log_pvalue,],
+                                                        lvl2_cutoff_s=[self.opt.log_broadcutoff,],
+                                                        min_length=self.minlen,
+                                                        lvl1_max_gap=self.maxgap,
+                                                        lvl2_max_gap=self.maxgap*4,
+                                                        cutoff_analysis=self.opt.cutoff_analysis)
+            else:
+                self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue)
+                peaks = scorecalculator.call_peaks(['p',], [self.log_pvalue,],
+                                                   min_length=self.minlen,
+                                                   max_gap=self.maxgap,
+                                                   call_summits=call_summits,
+                                                   cutoff_analysis=self.opt.cutoff_analysis)
+        elif self.log_qvalue is not None:
+            if self.opt.broad:
+                self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..."
% + (self.log_qvalue, self.opt.log_broadcutoff)) + peaks = scorecalculator.call_broadpeaks(['q',], + lvl1_cutoff_s=[self.log_qvalue,], + lvl2_cutoff_s=[self.opt.log_broadcutoff,], + min_length=self.minlen, + lvl1_max_gap=self.maxgap, + lvl2_max_gap=self.maxgap*4, + cutoff_analysis=self.opt.cutoff_analysis) + else: + peaks = scorecalculator.call_peaks(['q',], [self.log_qvalue,], + min_length=self.minlen, + max_gap=self.maxgap, + call_summits=call_summits, + cutoff_analysis=self.opt.cutoff_analysis) + scorecalculator.destroy() + return peaks diff --git a/MACS3/Signal/PeakDetect.pyx b/MACS3/Signal/PeakDetect.pyx deleted file mode 100644 index 64372fe6..00000000 --- a/MACS3/Signal/PeakDetect.pyx +++ /dev/null @@ -1,396 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2020-11-24 17:39:12 Tao Liu> - -"""Module Description: Detect peaks, main module - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). -""" -# ------------------------------------ -# Python modules -# ------------------------------------ -from itertools import groupby -from operator import itemgetter -import io -import gc # use garbage collectior - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.IO.PeakIO import PeakIO -from MACS3.IO.BedGraphIO import bedGraphIO -from MACS3.Utilities.Constants import * -from MACS3.Signal.CallPeakUnit import CallerFromAlignments - -cdef bytes subpeak_letters(short i): - if i < 26: - return chr(97+i).encode() - else: - return subpeak_letters(i // 26) + chr(97 + (i % 26)).encode() - -class PeakDetect: - """Class to do the peak calling. - - e.g - >>> from MACS3.cPeakDetect import cPeakDetect - >>> pd = PeakDetect(treat=treatdata, control=controldata, pvalue=pvalue_cutoff, d=100, gsize=3000000000) - >>> pd.call_peaks() - """ - def __init__ (self,opt = None,treat = None, control = None, d = None, - maxgap = None, minlen = None, slocal = None, llocal = None): - """Initialize the PeakDetect object. - - """ - self.opt = opt - self.info = opt.info - self.debug = opt.debug - self.warn = opt.warn - - self.treat = treat - self.control = control - self.ratio_treat2control = None - self.peaks = None - self.final_peaks = None - self.PE_MODE = opt.PE_MODE - self.scoretrack = None - - #self.femax = opt.femax - #self.femin = opt.femin - #self.festep = opt.festep - - self.log_pvalue = opt.log_pvalue # -log10pvalue - self.log_qvalue = opt.log_qvalue # -log10qvalue - if d != None: - self.d = d - else: - self.d = self.opt.d - - if opt.maxgap: - self.maxgap = opt.maxgap - else: - self.maxgap = opt.tsize - - if opt.minlen: - self.minlen = opt.minlen - else: - self.minlen = self.d - - self.end_shift = self.opt.shift - self.gsize = opt.gsize - - self.nolambda = opt.nolambda - - if slocal != None: - self.sregion = slocal - else: - self.sregion = opt.smalllocal - - if llocal != None: - self.lregion = llocal - else: - self.lregion = opt.largelocal - - if (self.nolambda): - self.info("#3 !!!! DYNAMIC LAMBDA IS DISABLED !!!!") - #self.diag = opt.diag - #self.save_score = opt.store_score - #self.zwig_tr = opt.zwig_tr - #self.zwig_ctl= opt.zwig_ctl - - def call_peaks (self): - """Call peaks function. - - Scan the whole genome for peaks. RESULTS WILL BE SAVED IN - self.final_peaks and self.final_negative_peaks. 
- """ - if self.control: # w/ control - #if self.opt.broad: - # (self.peaks,self.broadpeaks) = self.__call_peaks_w_control() - #else: - self.peaks = self.__call_peaks_w_control () - else: # w/o control - #if self.opt.broad: - # (self.peaks,self.broadpeaks) = self.__call_peaks_wo_control() - #else: - self.peaks = self.__call_peaks_wo_control () - return self.peaks - - def __call_peaks_w_control (self): - """To call peaks with control data. - - A peak info type is a: dictionary - - key value: chromosome - - items: (peak start,peak end, peak length, peak summit, peak - height, number of tags in peak region, peak pvalue, peak - fold_enrichment) <-- tuple type - - While calculating pvalue: - - First, t and c will be adjusted by the ratio between total - reads in treatment and total reads in control, depending on - --to-small option. - - Then, t and c will be multiplied by the smallest peak size -- - self.d. - - Finally, a poisson CDF is applied to calculate one-side pvalue - for enrichment. - """ - cdef: - int i - float lambda_bg, effective_depth_in_million - float treat_scale, d - list ctrl_scale_s, ctrl_d_s - long treat_total, control_total - long treat_sum # approx sum of treatment pileup values - long control_sum # approx sum of control pileup values - - treat_total = self.treat.total - - if self.PE_MODE: - d = self.treat.average_template_length - control_total = self.control.total * 2 # in PE mode, entire fragment is counted as 1 - # in treatment whereas both ends of fragment are counted in control/input. - treat_sum = self.treat.length - control_sum = control_total * self.treat.average_template_length - self.ratio_treat2control = float(treat_sum)/control_sum - else: - d = self.d - control_total = self.control.total - treat_sum = self.treat.total * self.d - control_sum = self.control.total * self.d - self.ratio_treat2control = float(treat_sum)/control_sum - - if self.opt.ratio != 1.0: - self.ratio_treat2control = self.opt.ratio - - if self.opt.tocontrol: - # if MACS decides to scale treatment to control data because treatment is bigger - effective_depth_in_million = control_total / 1000000.0 - lambda_bg = float( control_sum )/ self.gsize - treat_scale = 1/self.ratio_treat2control - else: - # if MACS decides to scale control to treatment because control sample is bigger - effective_depth_in_million = treat_total / 1000000.0 - lambda_bg = float( treat_sum )/ self.gsize - treat_scale = 1.0 - - # prepare d_s for control data - if self.sregion: - assert self.d <= self.sregion, f"{self.sregion:} can't be smaller than {self.d:}!" - if self.lregion: - assert self.d <= self.lregion , f"{self.lregion:} can't be smaller than {self.d:}!" - assert self.sregion <= self.lregion , f"{self.lregion:} can't be smaller than {self.sregion:}!" - - # Now prepare a list of extension sizes - ctrl_d_s = [ self.d ] # note, d doesn't make sense in PE mode. 
- # And a list of scaling factors for control - ctrl_scale_s = [] - - # d - if not self.opt.tocontrol: - # if user wants to scale everything to ChIP data - tmp_v = self.ratio_treat2control - else: - tmp_v = 1.0 - ctrl_scale_s.append( tmp_v ) - - # slocal size local - if self.sregion: - ctrl_d_s.append( self.sregion ) - if not self.opt.tocontrol: - # if user want to scale everything to ChIP data - tmp_v = float(self.d)/self.sregion*self.ratio_treat2control - else: - tmp_v = float(self.d)/self.sregion - ctrl_scale_s.append( tmp_v ) - - # llocal size local - if self.lregion and self.lregion > self.sregion: - ctrl_d_s.append( self.lregion ) - if not self.opt.tocontrol: - # if user want to scale everything to ChIP data - tmp_v = float(self.d)/self.lregion*self.ratio_treat2control - else: - tmp_v = float(self.d)/self.lregion - ctrl_scale_s.append( tmp_v ) - - #if self.PE_MODE: # first d/scale are useless in PE mode - # ctrl_d_s = ctrl_d_s[1:] - # ctrl_scale_s = ctrl_scale_s[1:] - # print ctrl_d_s - # print ctrl_scale_s - if self.nolambda: - ctrl_d_s = [] - ctrl_scale_s = [] - - scorecalculator = CallerFromAlignments( self.treat, self.control, - d = d, ctrl_d_s = ctrl_d_s, - treat_scaling_factor = treat_scale, - ctrl_scaling_factor_s = ctrl_scale_s, - end_shift = self.end_shift, - lambda_bg = lambda_bg, - save_bedGraph = self.opt.store_bdg, - bedGraph_filename_prefix = self.opt.name, - bedGraph_treat_filename = self.opt.bdg_treat, - bedGraph_control_filename = self.opt.bdg_control, - save_SPMR = self.opt.do_SPMR, - cutoff_analysis_filename = self.opt.cutoff_analysis_file ) - - if self.opt.trackline: scorecalculator.enable_trackline() - - # call peaks - call_summits = self.opt.call_summits - if call_summits: self.info("#3 Going to call summits inside each peak ...") - - if self.log_pvalue != None: - if self.opt.broad: - self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) ) - peaks = scorecalculator.call_broadpeaks(['p',], - lvl1_cutoff_s=[self.log_pvalue,], - lvl2_cutoff_s=[self.opt.log_broadcutoff,], - min_length=self.minlen, - lvl1_max_gap=self.maxgap, - lvl2_max_gap=self.maxgap*4, - cutoff_analysis=self.opt.cutoff_analysis ) - else: - self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue) - peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,], - min_length=self.minlen, - max_gap=self.maxgap, - call_summits=call_summits, - cutoff_analysis=self.opt.cutoff_analysis ) - elif self.log_qvalue != None: - if self.opt.broad: - self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." % (self.log_qvalue,self.opt.log_broadcutoff) ) - peaks = scorecalculator.call_broadpeaks(['q',], - lvl1_cutoff_s=[self.log_qvalue,], - lvl2_cutoff_s=[self.opt.log_broadcutoff,], - min_length=self.minlen, - lvl1_max_gap=self.maxgap, - lvl2_max_gap=self.maxgap*4, - cutoff_analysis=self.opt.cutoff_analysis ) - else: - peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,], - min_length=self.minlen, - max_gap=self.maxgap, - call_summits=call_summits, - cutoff_analysis=self.opt.cutoff_analysis ) - scorecalculator.destroy() - return peaks - - def __call_peaks_wo_control (self): - """To call peaks without control data. 
- - A peak info type is a: dictionary - - key value: chromosome - - items: (peak start,peak end, peak length, peak summit, peak - height, number of tags in peak region, peak pvalue, peak - fold_enrichment) <-- tuple type - - While calculating pvalue: - - First, t and c will be adjusted by the ratio between total - reads in treatment and total reads in control, depending on - --to-small option. - - Then, t and c will be multiplied by the smallest peak size -- - self.d. - - Finally, a poisson CDF is applied to calculate one-side pvalue - for enrichment. - """ - cdef float lambda_bg, effective_depth_in_million - cdef float treat_scale = 1 - cdef float d - cdef list ctrl_scale_s, ctrl_d_s - - if self.PE_MODE: d = 0 - else: d = self.d - treat_length = self.treat.length - treat_total = self.treat.total - - effective_depth_in_million = treat_total / 1000000.0 - - # global lambda - if self.PE_MODE: - # # this an estimator, we should maybe test it for accuracy? - lambda_bg = treat_length / self.gsize - else: - lambda_bg = float(d) * treat_total / self.gsize - treat_scale = 1.0 - - # slocal and d-size local bias are not calculated! - # nothing done here. should this match w control?? - - if not self.nolambda: - if self.PE_MODE: - ctrl_scale_s = [ float(treat_length) / (self.lregion*treat_total*2), ] - else: - ctrl_scale_s = [ float(self.d) / self.lregion, ] - ctrl_d_s = [ self.lregion, ] - else: - ctrl_scale_s = [] - ctrl_d_s = [] - - scorecalculator = CallerFromAlignments( self.treat, None, - d = d, ctrl_d_s = ctrl_d_s, - treat_scaling_factor = treat_scale, - ctrl_scaling_factor_s = ctrl_scale_s, - end_shift = self.end_shift, - lambda_bg = lambda_bg, - save_bedGraph = self.opt.store_bdg, - bedGraph_filename_prefix = self.opt.name, - bedGraph_treat_filename = self.opt.bdg_treat, - bedGraph_control_filename = self.opt.bdg_control, - save_SPMR = self.opt.do_SPMR, - cutoff_analysis_filename = self.opt.cutoff_analysis_file ) - - if self.opt.trackline: scorecalculator.enable_trackline() - - # call peaks - call_summits = self.opt.call_summits - if call_summits: self.info("#3 Going to call summits inside each peak ...") - - if self.log_pvalue != None: - if self.opt.broad: - self.info("#3 Call broad peaks with given level1 -log10pvalue cutoff and level2: %.5f, %.5f..." % (self.log_pvalue,self.opt.log_broadcutoff) ) - peaks = scorecalculator.call_broadpeaks(['p',], - lvl1_cutoff_s=[self.log_pvalue,], - lvl2_cutoff_s=[self.opt.log_broadcutoff,], - min_length=self.minlen, - lvl1_max_gap=self.maxgap, - lvl2_max_gap=self.maxgap*4, - cutoff_analysis=self.opt.cutoff_analysis ) - else: - self.info("#3 Call peaks with given -log10pvalue cutoff: %.5f ..." % self.log_pvalue) - peaks = scorecalculator.call_peaks( ['p',], [self.log_pvalue,], - min_length=self.minlen, - max_gap=self.maxgap, - call_summits=call_summits, - cutoff_analysis=self.opt.cutoff_analysis ) - elif self.log_qvalue != None: - if self.opt.broad: - self.info("#3 Call broad peaks with given level1 -log10qvalue cutoff and level2: %f, %f..." 
% (self.log_qvalue,self.opt.log_broadcutoff) ) - peaks = scorecalculator.call_broadpeaks(['q',], - lvl1_cutoff_s=[self.log_qvalue,], - lvl2_cutoff_s=[self.opt.log_broadcutoff,], - min_length=self.minlen, - lvl1_max_gap=self.maxgap, - lvl2_max_gap=self.maxgap*4, - cutoff_analysis=self.opt.cutoff_analysis ) - else: - peaks = scorecalculator.call_peaks( ['q',], [self.log_qvalue,], - min_length=self.minlen, - max_gap=self.maxgap, - call_summits=call_summits, - cutoff_analysis=self.opt.cutoff_analysis ) - scorecalculator.destroy() - return peaks - diff --git a/MACS3/Signal/PeakModel.py b/MACS3/Signal/PeakModel.py new file mode 100644 index 00000000..984e3e7b --- /dev/null +++ b/MACS3/Signal/PeakModel.py @@ -0,0 +1,513 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-15 10:20:32 Tao Liu> +"""Module Description: Build shifting model + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). +""" + +# ------------------------------------ +# Python modules +# ------------------------------------ + +# ------------------------------------ +# MACS3 modules +# ------------------------------------ +# from MACS3.Utilities.Constants import * +from MACS3.Signal.Pileup import naive_quick_pileup, naive_call_peaks + +# ------------------------------------ +# Other modules +# ------------------------------------ +import cython +from cython.cimports.cpython import bool +import numpy as np +import cython.cimports.numpy as cnp + +# ------------------------------------ +# C lib +# ------------------------------------ + + +class NotEnoughPairsException(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + + +@cython.cclass +class PeakModel: + """Peak Model class. + """ + # this can be PETrackI or FWTrack + treatment: object + # genome size + gz: cython.double + max_pairnum: cython.int + umfold: cython.int + lmfold: cython.int + bw: cython.int + d_min: cython.int + tag_expansion_size: cython.int + + info: object + debug: object + warn: object + error: object + + summary: str + max_tags: cython.int + peaksize: cython.int + + plus_line = cython.declare(cnp.ndarray, visibility="public") + minus_line = cython.declare(cnp.ndarray, visibility="public") + shifted_line = cython.declare(cnp.ndarray, visibility="public") + xcorr = cython.declare(cnp.ndarray, visibility="public") + ycorr = cython.declare(cnp.ndarray, visibility="public") + + d = cython.declare(cython.int, visibility="public") + scan_window = cython.declare(cython.int, visibility="public") + min_tags = cython.declare(cython.int, visibility="public") + alternative_d = cython.declare(list, visibility="public") + + def __init__(self, opt, treatment, max_pairnum: cython.int = 500): + # , double gz = 0, int umfold=30, int lmfold=10, int bw=200, + # int ts = 25, int bg=0, bool quiet=False): + self.treatment = treatment + self.gz = opt.gsize + self.umfold = opt.umfold + self.lmfold = opt.lmfold + # opt.tsize| test 10bps. The reason is that we want the best + # 'lag' between left & right cutting sides. A tag will be + # expanded to 10bps centered at cutting point. + self.tag_expansion_size = 10 + # discard any predicted fragment sizes < d_min + self.d_min = opt.d_min + self.bw = opt.bw + self.info = opt.info + self.debug = opt.debug + self.warn = opt.warn + self.error = opt.warn + self.max_pairnum = max_pairnum + + @cython.ccall + def build(self): + """Build the model. 
Main function of PeakModel class.
+
+        1. prepare self.d, self.scan_window, self.plus_line,
+        self.minus_line and self.shifted_line.
+
+        2. find paired + and - strand peaks
+
+        3. find the best d using x-correlation
+        """
+        paired_peakpos: dict
+        num_paired_peakpos: cython.long
+        c: bytes                # chromosome
+
+        self.peaksize = 2*self.bw
+        # minimum unique hits on single strand, decided by lmfold
+        self.min_tags = int(round(float(self.treatment.total) *
+                                  self.lmfold *
+                                  self.peaksize / self.gz / 2))
+        # maximum unique hits on single strand, decided by umfold
+        self.max_tags = int(round(float(self.treatment.total) *
+                                  self.umfold *
+                                  self.peaksize / self.gz / 2))
+        self.debug(f"#2 min_tags: {self.min_tags}; max_tags: {self.max_tags}; ")
+        self.info("#2 looking for paired plus/minus strand peaks...")
+        # find paired + and - strand peaks
+        paired_peakpos = self.__find_paired_peaks()
+
+        num_paired_peakpos = 0
+        for c in list(paired_peakpos.keys()):
+            num_paired_peakpos += len(paired_peakpos[c])
+
+        self.info("#2 Total number of paired peaks: %d" % (num_paired_peakpos))
+
+        if num_paired_peakpos < 100:
+            self.error(f"#2 MACS3 needs at least 100 paired peaks at + and - strand to build the model, but can only find {num_paired_peakpos}! Please make your MFOLD range broader and try again. If MACS3 still can't build the model, we suggest using --nomodel and --extsize 147, or another fixed number, instead.")
+            self.error("#2 Process for pairing-model is terminated!")
+            raise NotEnoughPairsException("Not enough pairs to build model")
+
+        # build model, find the best d using cross-correlation
+        self.__paired_peak_model(paired_peakpos)
+
+    def __str__(self):
+        """For debug...
+
+        """
+        return """
+Summary of Peak Model:
+ Baseline: %d
+ Upperline: %d
+ Fragment size: %d
+ Scan window size: %d
+""" % (self.min_tags, self.max_tags, self.d, self.scan_window)
+
+    @cython.cfunc
+    def __find_paired_peaks(self) -> dict:
+        """Call paired peaks from fwtrackI object.
+
+        Return paired peaks center positions.
+        """
+        i: cython.int
+        chrs: list
+        chrom: bytes
+        plus_tags: cnp.ndarray(cython.int, ndim=1)
+        minus_tags: cnp.ndarray(cython.int, ndim=1)
+        plus_peaksinfo: list
+        minus_peaksinfo: list
+        paired_peaks_pos: dict  # return
+
+        chrs = list(self.treatment.get_chr_names())
+        chrs.sort()
+        paired_peaks_pos = {}
+        for i in range(len(chrs)):
+            chrom = chrs[i]
+            self.debug(f"Chromosome: {chrom}")
+            # extract tag positions
+            [plus_tags, minus_tags] = self.treatment.get_locations_by_chr(chrom)
+            # look for + strand peaks
+            plus_peaksinfo = self.__naive_find_peaks(plus_tags)
+            self.debug("Number of unique tags on + strand: %d" % (plus_tags.shape[0]))
+            self.debug("Number of peaks in + strand: %d" % (len(plus_peaksinfo)))
+            if plus_peaksinfo:
+                self.debug(f"plus peaks: first - {plus_peaksinfo[0]} ... last - {plus_peaksinfo[-1]}")
+            # look for - strand peaks
+            minus_peaksinfo = self.__naive_find_peaks(minus_tags)
+            self.debug("Number of unique tags on - strand: %d" % (minus_tags.shape[0]))
+            self.debug("Number of peaks in - strand: %d" % (len(minus_peaksinfo)))
+            if minus_peaksinfo:
+                self.debug(f"minus peaks: first - {minus_peaksinfo[0]} ... last - {minus_peaksinfo[-1]}")
+            if not plus_peaksinfo or not minus_peaksinfo:
+                self.debug("Chrom %s is discarded!"
% (chrom)) + continue + else: + paired_peaks_pos[chrom] = self.__find_pair_center(plus_peaksinfo, minus_peaksinfo) + self.debug("Number of paired peaks in this chromosome: %d" % (len(paired_peaks_pos[chrom]))) + return paired_peaks_pos + + @cython.cfunc + def __naive_find_peaks(self, + taglist: cnp.ndarray(cython.int, ndim=1)) -> list: + """Naively call peaks based on tags counting. + + Return peak positions and the tag number in peak region by a tuple list[(pos,num)]. + """ + peak_info: list + pileup_array: list + + # store peak pos in every peak region and unique tag number in + # every peak region + peak_info = [] + + # less than 2 tags, no need to call peaks, return [] + if taglist.shape[0] < 2: + return peak_info + + # build pileup by extending both side to half peak size + pileup_array = naive_quick_pileup(taglist, int(self.peaksize/2)) + peak_info = naive_call_peaks(pileup_array, + self.min_tags, + self.max_tags) + + return peak_info + + @cython.cfunc + def __paired_peak_model(self, paired_peakpos: dict): + """Use paired peak positions and treatment tag positions to + build the model. + + Modify self.(d, model_shift size and scan_window size. and + extra, plus_line, minus_line and shifted_line for plotting). + + """ + window_size: cython.int + i: cython.int + chroms: list + paired_peakpos_chrom: object + + tags_plus: cnp.ndarray(cython.int, ndim=1) + tags_minus: cnp.ndarray(cython.int, ndim=1) + plus_start: cnp.ndarray(cython.int, ndim=1) + plus_end: cnp.ndarray(cython.int, ndim=1) + minus_start: cnp.ndarray(cython.int, ndim=1) + minus_end: cnp.ndarray(cython.int, ndim=1) + plus_line: cnp.ndarray(cython.int, ndim=1) + minus_line: cnp.ndarray(cython.int, ndim=1) + + plus_data: cnp.ndarray + minus_data: cnp.ndarray + xcorr: cnp.ndarray + ycorr: cnp.ndarray + i_l_max: cnp.ndarray + + window_size = 1+2*self.peaksize+self.tag_expansion_size + # for plus strand pileup + self.plus_line = np.zeros(window_size, dtype="i4") + # for minus strand pileup + self.minus_line = np.zeros(window_size, dtype="i4") + # for fast pileup + plus_start = np.zeros(window_size, dtype="i4") + # for fast pileup + plus_end = np.zeros(window_size, dtype="i4") + # for fast pileup + minus_start = np.zeros(window_size, dtype="i4") + # for fast pileup + minus_end = np.zeros(window_size, dtype="i4") + self.debug("start model_add_line...") + chroms = list(paired_peakpos.keys()) + + for i in range(len(chroms)): + paired_peakpos_chrom = paired_peakpos[chroms[i]] + (tags_plus, tags_minus) = self.treatment.get_locations_by_chr(chroms[i]) + # every paired peak has plus line and minus line + self.__model_add_line(paired_peakpos_chrom, + tags_plus, + plus_start, + plus_end) + self.__model_add_line(paired_peakpos_chrom, + tags_minus, + minus_start, + minus_end) + + self.__count(plus_start, plus_end, self.plus_line) + self.__count(minus_start, minus_end, self.minus_line) + + self.debug("start X-correlation...") + # Now I use cross-correlation to find the best d + plus_line = self.plus_line + minus_line = self.minus_line + + # normalize first + minus_data = (minus_line - minus_line.mean())/(minus_line.std()*len(minus_line)) + plus_data = (plus_line - plus_line.mean())/(plus_line.std()*len(plus_line)) + + # cross-correlation + ycorr = np.correlate(minus_data, plus_data, mode="full")[window_size-self.peaksize:window_size+self.peaksize] + xcorr = np.linspace(len(ycorr)//2*-1, len(ycorr)//2, num=len(ycorr)) + + # smooth correlation values to get rid of local maximums from small fluctuations. + # window size is by default 11. 
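# A self-contained toy version of the cross-correlation step above
# (hypothetical data; `np` is numpy). The best fragment size d is the
# lag at which the minus-strand pileup best matches the plus-strand
# pileup:
import numpy as np

plus = np.array([0, 1, 3, 5, 3, 1, 0, 0, 0, 0], dtype="f4")
minus = np.roll(plus, 4)                  # minus peak lags plus by 4 bp
p = (plus - plus.mean()) / (plus.std() * len(plus))
m = (minus - minus.mean()) / (minus.std() * len(minus))
cc = np.correlate(m, p, mode="full")      # length 2*len(plus) - 1
lag = int(cc.argmax()) - (len(plus) - 1)  # -> 4, the simulated shift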
+ ycorr = smooth(ycorr, window="flat") + + # all local maximums could be alternative ds. + i_l_max = np.r_[False, ycorr[1:] > ycorr[:-1]] & np.r_[ycorr[:-1] > ycorr[1:], False] + i_l_max = np.where(i_l_max)[0] + i_l_max = i_l_max[xcorr[i_l_max] > self.d_min] + i_l_max = i_l_max[np.argsort(ycorr[i_l_max])[::-1]] + + self.alternative_d = sorted([int(x) for x in xcorr[i_l_max]]) + assert len(self.alternative_d) > 0, "No proper d can be found! Tweak --mfold?" + + self.d = xcorr[i_l_max[0]] + + self.ycorr = ycorr + self.xcorr = xcorr + + self.scan_window = max(self.d, self.tag_expansion_size)*2 + + self.info("#2 Model building with cross-correlation: Done") + + return True + + @cython.cfunc + def __model_add_line(self, + pos1: list, + pos2: cnp.ndarray(cython.int, ndim=1), + start: cnp.ndarray(cython.int, ndim=1), + end: cnp.ndarray(cython.int, ndim=1)): + """Project each pos in pos2 which is included in + [pos1-self.peaksize,pos1+self.peaksize] to the line. + + pos1: paired centers -- list of coordinates + pos2: tags of certain strand -- a numpy.array object + line: numpy array object where we pileup tags + + """ + i1: cython.int + i2: cython.int + i2_prev: cython.int + i1_max: cython.int + i2_max: cython.int + last_p2: cython.int + psize_adjusted1: cython.int + p1: cython.int + p2: cython.int + max_index: cython.int + s: cython.int + e: cython.int + + i1 = 0 # index for pos1 + i2 = 0 # index for pos2 index for pos2 in + # previous pos1 [pos1-self.peaksize,pos1+self.peaksize] region + i2_prev = 0 + i1_max = len(pos1) + i2_max = pos2.shape[0] + flag_find_overlap = False + + max_index = start.shape[0] - 1 + + # half window + psize_adjusted1 = self.peaksize + self.tag_expansion_size // 2 + + while i1 < i1_max and i2 < i2_max: + p1 = pos1[i1] + p2 = pos2[i2] + + if p1-psize_adjusted1 > p2: + # move pos2 + i2 += 1 + elif p1+psize_adjusted1 < p2: + # move pos1 + i1 += 1 + i2 = i2_prev # search minus peaks from previous index + flag_find_overlap = False + else: # overlap! + if not flag_find_overlap: + flag_find_overlap = True + # only the first index is recorded + i2_prev = i2 + # project + s = max(int(p2-self.tag_expansion_size/2-p1+psize_adjusted1), 0) + start[s] += 1 + e = min(int(p2+self.tag_expansion_size/2-p1+psize_adjusted1), max_index) + end[e] -= 1 + i2 += 1 + return + + @cython.cfunc + def __count(self, + start: cnp.ndarray(cython.int, ndim=1), + end: cnp.ndarray(cython.int, ndim=1), + line: cnp.ndarray(cython.int, ndim=1)): + """ + """ + i: cython.int + pileup: cython.long + + pileup = 0 + for i in range(line.shape[0]): + pileup += start[i] + end[i] + line[i] = pileup + return + + @cython.cfunc + def __find_pair_center(self, + pluspeaks: list, + minuspeaks: list): + # index for plus peaks + ip: cython.long = 0 + # index for minus peaks + im: cython.long = 0 + # index for minus peaks in previous plus peak + im_prev: cython.long = 0 + pair_centers: list + ip_max: cython.long + im_max: cython.long + flag_find_overlap: bool + pp: cython.int + mp: cython.int + pn: cython.float + mn: cython.float + + pair_centers = [] + ip_max = len(pluspeaks) + im_max = len(minuspeaks) + self.debug(f"ip_max: {ip_max}; im_max: {im_max}") + flag_find_overlap = False + while ip < ip_max and im < im_max: + # for (peakposition, tagnumber in peak) + (pp, pn) = pluspeaks[ip] + (mp, mn) = minuspeaks[im] + if pp-self.peaksize > mp: + # move minus + im += 1 + elif pp+self.peaksize < mp: + # move plus + ip += 1 + im = im_prev # search minus peaks from previous index + flag_find_overlap = False + else: # overlap! 
+                if not flag_find_overlap:
+                    flag_find_overlap = True
+                    # only the first index is recorded
+                    im_prev = im
+                # number tags in plus and minus peak region are comparable...
+                if pn/mn < 2 and pn/mn > 0.5:
+                    if pp < mp:
+                        pair_centers.append((pp+mp)//2)
+                im += 1
+        if pair_centers:
+            self.debug(f"Paired centers: first - {pair_centers[0]} ... second - {pair_centers[-1]} ")
+        return pair_centers
+
+
+# smooth function from SciPy cookbook:
+# http://www.scipy.org/Cookbook/SignalSmooth
+@cython.ccall
+def smooth(x,
+           window_len: cython.int = 11,
+           window: str = 'hanning'):
+    """smooth the data using a window with requested size.
+
+    This method is based on the convolution of a scaled window with the signal.
+    The signal is prepared by introducing reflected copies of the signal
+    (with the window size) in both ends so that transient parts are minimized
+    in the beginning and end part of the output signal.
+
+    input:
+        x: the input signal
+        window_len: the dimension of the smoothing window; should be
+                    an odd integer
+        window: the type of window from 'flat', 'hanning', 'hamming',
+                'bartlett', 'blackman'. A flat window will produce a
+                moving average smoothing.
+
+    output:
+        the smoothed signal
+
+    example:
+
+    t = np.linspace(-2, 2, 50)
+    x = np.sin(t) + np.random.randn(len(t))*0.1
+    y = smooth(x)
+
+    see also:
+
+    numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman,
+    numpy.convolve, scipy.signal.lfilter
+
+    TODO: the window parameter could be the window itself if an array
+    instead of a string
+
+    NOTE: the raw convolution is longer than the input; the slicing in
+    the return statement restores the input length for odd window_len.
+    """
+
+    if x.ndim != 1:
+        raise ValueError("smooth only accepts 1 dimension arrays.")
+
+    if x.size < window_len:
+        raise ValueError("Input vector needs to be bigger than window size.")
+
+    if window_len < 3:
+        return x
+
+    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
+        raise ValueError("Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
+
+    s = np.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]]
+
+    if window == 'flat':  # moving average
+        w = np.ones(window_len, 'd')
+    else:
+        # look the window function up on the numpy module instead of eval()
+        w = getattr(np, window)(window_len)
+
+    y = np.convolve(w/w.sum(), s, mode='valid')
+    return y[(window_len//2):-(window_len//2)]
diff --git a/MACS3/Signal/PeakModel.pyx b/MACS3/Signal/PeakModel.pyx
deleted file mode 100644
index 575ce114..00000000
--- a/MACS3/Signal/PeakModel.pyx
+++ /dev/null
@@ -1,418 +0,0 @@
-# cython: language_level=3
-# cython: profile=True
-# Time-stamp: <2024-10-04 18:10:08 Tao Liu>
-"""Module Description: Build shifting model
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD License (see the file LICENSE included with
-the distribution).
-""" - -# ------------------------------------ -# Python modules -# ------------------------------------ -import sys, time, random -import array - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.Utilities.Constants import * -from MACS3.Signal.Pileup import naive_quick_pileup, naive_call_peaks - -# ------------------------------------ -# Other modules -# ------------------------------------ -from cpython cimport bool -from cpython cimport array -import numpy as np -cimport numpy as np - -# ------------------------------------ -# C lib -# ------------------------------------ -from libc.stdint cimport uint32_t, uint64_t, int32_t, int64_t - -ctypedef np.float32_t float32_t - -class NotEnoughPairsException(Exception): - def __init__ (self,value): - self.value = value - def __str__ (self): - return repr(self.value) - -cdef class PeakModel: - """Peak Model class. - """ - cdef: - object treatment - double gz - int max_pairnum - int umfold - int lmfold - int bw - int d_min - int tag_expansion_size - object info, debug, warn, error - str summary - int max_tags - int peaksize - public np.ndarray plus_line, minus_line, shifted_line - public int d - public int scan_window - public int min_tags - public list alternative_d - public np.ndarray xcorr, ycorr - - def __init__ ( self, opt , treatment, int max_pairnum=500 ): #, double gz = 0, int umfold=30, int lmfold=10, int bw=200, int ts = 25, int bg=0, bool quiet=False): - self.treatment = treatment - self.gz = opt.gsize - self.umfold = opt.umfold - self.lmfold = opt.lmfold - self.tag_expansion_size = 10 #opt.tsize| test 10bps. The reason is that we want the best 'lag' between left & right cutting sides. A tag will be expanded to 10bps centered at cutting point. - self.d_min = opt.d_min #discard any predicted fragment sizes < d_min - self.bw = opt.bw - self.info = opt.info - self.debug = opt.debug - self.warn = opt.warn - self.error = opt.warn - self.max_pairnum = max_pairnum - - cpdef build (self): - """Build the model. Main function of PeakModel class. - - 1. prepare self.d, self.scan_window, self.plus_line, - self.minus_line and self.shifted_line. - - 2. find paired + and - strand peaks - - 3. find the best d using x-correlation - """ - cdef: - dict paired_peakpos - long num_paired_peakpos - bytes c #chromosome - - self.peaksize = 2*self.bw - self.min_tags = int(round(float(self.treatment.total) * self.lmfold * self.peaksize / self.gz / 2)) # mininum unique hits on single strand, decided by lmfold - self.max_tags = int(round(float(self.treatment.total) * self.umfold * self.peaksize / self.gz /2)) # maximum unique hits on single strand, decided by umfold - self.debug( f"#2 min_tags: {self.min_tags}; max_tags:{self.max_tags}; " ) - - self.info( "#2 looking for paired plus/minus strand peaks..." ) - # find paired + and - strand peaks - paired_peakpos = self.__find_paired_peaks () - - num_paired_peakpos = 0 - for c in list( paired_peakpos.keys() ): - num_paired_peakpos += len (paired_peakpos[c] ) - - self.info("#2 Total number of paired peaks: %d" % (num_paired_peakpos)) - - if num_paired_peakpos < 100: - self.error(f"#2 MACS3 needs at least 100 paired peaks at + and - strand to build the model, but can only find {num_paired_peakpos}! Please make your MFOLD range broader and try again. 
If MACS3 still can't build the model, we suggest to use --nomodel and --extsize 147 or other fixed number instead.") - self.error("#2 Process for pairing-model is terminated!") - raise NotEnoughPairsException("No enough pairs to build model") - - # build model, find the best d using cross-correlation - self.__paired_peak_model(paired_peakpos) - - def __str__ (self): - """For debug... - - """ - return """ -Summary of Peak Model: - Baseline: %d - Upperline: %d - Fragment size: %d - Scan window size: %d -""" % (self.min_tags,self.max_tags,self.d,self.scan_window) - - cdef dict __find_paired_peaks (self): - """Call paired peaks from fwtrackI object. - - Return paired peaks center positions. - """ - cdef: - int i - list chrs - bytes chrom - np.ndarray[np.int32_t, ndim=1] plus_tags, minus_tags - list plus_peaksinfo - list minus_peaksinfo - dict paired_peaks_pos # return - - chrs = list(self.treatment.get_chr_names()) - chrs.sort() - paired_peaks_pos = {} - for i in range( len(chrs) ): - chrom = chrs[ i ] - self.debug( f"Chromosome: {chrom}" ) - # extract tag positions - [ plus_tags, minus_tags ] = self.treatment.get_locations_by_chr( chrom ) - # look for + strand peaks - plus_peaksinfo = self.__naive_find_peaks ( plus_tags ) - self.debug("Number of unique tags on + strand: %d" % ( plus_tags.shape[0] ) ) - self.debug("Number of peaks in + strand: %d" % ( len(plus_peaksinfo) ) ) - if plus_peaksinfo: - self.debug(f"plus peaks: first - {plus_peaksinfo[0]} ... last - {plus_peaksinfo[-1]}") - # look for - strand peaks - minus_peaksinfo = self.__naive_find_peaks ( minus_tags ) - self.debug("Number of unique tags on - strand: %d" % ( minus_tags.shape[0] ) ) - self.debug("Number of peaks in - strand: %d" % ( len( minus_peaksinfo ) ) ) - if minus_peaksinfo: - self.debug(f"minus peaks: first - {minus_peaksinfo[0]} ... last - {minus_peaksinfo[-1]}") - if not plus_peaksinfo or not minus_peaksinfo: - self.debug("Chrom %s is discarded!" % (chrom) ) - continue - else: - paired_peaks_pos[chrom] = self.__find_pair_center (plus_peaksinfo, minus_peaksinfo) - self.debug("Number of paired peaks in this chromosome: %d" %(len(paired_peaks_pos[chrom]))) - return paired_peaks_pos - - cdef list __naive_find_peaks ( self, np.ndarray[np.int32_t, ndim=1] taglist ): - """Naively call peaks based on tags counting. - - Return peak positions and the tag number in peak region by a tuple list [(pos,num)]. - """ - cdef: - long i - int pos - list peak_info - list pileup_array - - peak_info = [] # store peak pos in every peak region and - # unique tag number in every peak region - if taglist.shape[0] < 2: # less than 2 tags, no need to call peaks, return [] - return peak_info - - pileup_array = naive_quick_pileup( taglist, int(self.peaksize/2) ) # build pileup by extending both side to half peak size - peak_info = naive_call_peaks( pileup_array, self.min_tags, self.max_tags ) - - return peak_info - - cdef __paired_peak_model (self, dict paired_peakpos,): - """Use paired peak positions and treatment tag positions to build the model. - - Modify self.(d, model_shift size and scan_window size. and extra, plus_line, minus_line and shifted_line for plotting). 
- """ - cdef: - int window_size, i - list chroms - object paired_peakpos_chrom - np.ndarray[np.int32_t, ndim=1] tags_plus, tags_minus, plus_start, plus_end, minus_start, minus_end, plus_line, minus_line - np.ndarray plus_data, minus_data, xcorr, ycorr, i_l_max - - window_size = 1+2*self.peaksize+self.tag_expansion_size - self.plus_line = np.zeros(window_size, dtype="int32") # for plus strand pileup - self.minus_line = np.zeros(window_size, dtype="int32")# for minus strand pileup - plus_start = np.zeros(window_size, dtype="int32") # for fast pileup - plus_end = np.zeros(window_size, dtype="int32") # for fast pileup - minus_start = np.zeros(window_size, dtype="int32") # for fast pileup - minus_end = np.zeros(window_size, dtype="int32") # for fast pileup - self.debug("start model_add_line...") - chroms = list(paired_peakpos.keys()) - - for i in range(len(chroms)): - paired_peakpos_chrom = paired_peakpos[chroms[i]] - (tags_plus, tags_minus) = self.treatment.get_locations_by_chr(chroms[i]) - # every paired peak has plus line and minus line - self.__model_add_line (paired_peakpos_chrom, tags_plus, plus_start, plus_end) #, plus_strand=1) - self.__model_add_line (paired_peakpos_chrom, tags_minus, minus_start, minus_end) #, plus_strand=0) - - self.__count ( plus_start, plus_end, self.plus_line ) - self.__count ( minus_start, minus_end, self.minus_line ) - - self.debug("start X-correlation...") - # Now I use cross-correlation to find the best d - plus_line = self.plus_line - minus_line = self.minus_line - - # normalize first - minus_data = (minus_line - minus_line.mean())/(minus_line.std()*len(minus_line)) - plus_data = (plus_line - plus_line.mean())/(plus_line.std()*len(plus_line)) - - # cross-correlation - ycorr = np.correlate(minus_data,plus_data,mode="full")[window_size-self.peaksize:window_size+self.peaksize] - xcorr = np.linspace(len(ycorr)//2*-1, len(ycorr)//2, num=len(ycorr)) - - # smooth correlation values to get rid of local maximums from small fluctuations. - ycorr = smooth(ycorr, window="flat") # window size is by default 11. - - # all local maximums could be alternative ds. - i_l_max = np.r_[False, ycorr[1:] > ycorr[:-1]] & np.r_[ycorr[:-1] > ycorr[1:], False] - i_l_max = np.where(i_l_max)[0] - i_l_max = i_l_max[ xcorr[i_l_max] > self.d_min ] - i_l_max = i_l_max[ np.argsort(ycorr[i_l_max])[::-1]] - - self.alternative_d = sorted([int(x) for x in xcorr[i_l_max]]) - assert len(self.alternative_d) > 0, "No proper d can be found! Tweak --mfold?" - - self.d = xcorr[i_l_max[0]] - - self.ycorr = ycorr - self.xcorr = xcorr - - self.scan_window = max(self.d,self.tag_expansion_size)*2 - - self.info("#2 Model building with cross-correlation: Done") - - return True - - cdef __model_add_line (self, list pos1, np.ndarray[np.int32_t, ndim=1] pos2, np.ndarray[np.int32_t, ndim=1] start, np.ndarray[np.int32_t, ndim=1] end): #, int plus_strand=1): - """Project each pos in pos2 which is included in - [pos1-self.peaksize,pos1+self.peaksize] to the line. 
-
-        pos1: paired centers -- list of coordinates
-        pos2: tags of certain strand -- a numpy.array object
-        line: numpy array object where we pileup tags
-
-        """
-        cdef:
-            int i1, i2, i2_prev, i1_max, i2_max, last_p2, psize_adjusted1, psize_adjusted2, p1, p2, max_index, s, e
-
-        i1 = 0                  # index for pos1
-        i2 = 0                  # index for pos2
-        i2_prev = 0             # index for pos2 in previous pos1
-                                # [pos1-self.peaksize,pos1+self.peaksize]
-                                # region
-        i1_max = len(pos1)
-        i2_max = pos2.shape[0]
-        last_p2 = -1
-        flag_find_overlap = False
-
-        max_index = start.shape[0] - 1
-
-        psize_adjusted1 = self.peaksize + self.tag_expansion_size // 2 # half window
-
-        while i1 < i1_max and i2 < i2_max:
-            p1 = pos1[i1]
-            p2 = pos2[i2]
-
-            if p1-psize_adjusted1 > p2: # move pos2
-                i2 += 1
-            elif p1+psize_adjusted1 < p2: # move pos1
-                i1 += 1
-                i2 = i2_prev    # search minus peaks from previous index
-                flag_find_overlap = False
-            else:               # overlap!
-                if not flag_find_overlap:
-                    flag_find_overlap = True
-                    i2_prev = i2 # only the first index is recorded
-                # project
-                s = max(int(p2-self.tag_expansion_size/2-p1+psize_adjusted1), 0)
-                start[s] += 1
-                e = min(int(p2+self.tag_expansion_size/2-p1+psize_adjusted1), max_index)
-                end[e] -= 1
-                i2+=1
-        return
-
-    cdef __count ( self, np.ndarray[np.int32_t, ndim=1] start, np.ndarray[np.int32_t, ndim=1] end, np.ndarray[np.int32_t, ndim=1] line ):
-        """
-        """
-        cdef:
-            int i
-            long pileup
-        pileup = 0
-        for i in range(line.shape[0]):
-            pileup += start[i] + end[i]
-            line[i] = pileup
-        return
-
-    cdef __find_pair_center (self, list pluspeaks, list minuspeaks):
-        cdef:
-            long ip = 0             # index for plus peaks
-            long im = 0             # index for minus peaks
-            long im_prev = 0        # index for minus peaks in previous plus peak
-            list pair_centers
-            long ip_max
-            long im_max
-            bool flag_find_overlap
-            int pp, mp
-            float pn, mn
-
-        pair_centers = []
-        ip_max = len(pluspeaks)
-        im_max = len(minuspeaks)
-        self.debug(f"ip_max: {ip_max}; im_max: {im_max}")
-        flag_find_overlap = False
-        while ip < ip_max and im < im_max:
-            # for (peakposition, tagnumber in peak)
-            (pp, pn) = pluspeaks[ip]
-            (mp, mn) = minuspeaks[im]
-            if pp-self.peaksize > mp: # move minus
-                im += 1
-            elif pp+self.peaksize < mp: # move plus
-                ip += 1
-                im = im_prev    # search minus peaks from previous index
-                flag_find_overlap = False
-            else:               # overlap!
-                if not flag_find_overlap:
-                    flag_find_overlap = True
-                    im_prev = im # only the first index is recorded
-                if pn/mn < 2 and pn/mn > 0.5: # number tags in plus and minus peak region are comparable...
-                    if pp < mp:
-                        pair_centers.append((pp+mp)//2)
-                        #self.debug ( "distance: %d, minus: %d, plus: %d" % (mp-pp,mp,pp))
-                im += 1
-        if pair_centers:
-            self.debug(f"Paired centers: first - {pair_centers[0]} ... second - {pair_centers[-1]} ")
-        return pair_centers
-
-# smooth function from SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth
-cpdef smooth(x, int window_len=11, str window='hanning'):
-    """smooth the data using a window with requested size.
-
-    This method is based on the convolution of a scaled window with the signal.
-    The signal is prepared by introducing reflected copies of the signal
-    (with the window size) in both ends so that transient parts are minimized
-    in the beginning and end part of the output signal.
-
-    input:
-        x: the input signal
-        window_len: the dimension of the smoothing window; should be an odd integer
-        window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'
-            flat window will produce a moving average smoothing.
- - output: - the smoothed signal - - example: - - t=linspace(-2,2,0.1) - x=sin(t)+randn(len(t))*0.1 - y=smooth(x) - - see also: - - numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve - scipy.signal.lfilter - - TODO: the window parameter could be the window itself if an array instead of a string - NOTE: length(output) != length(input), to correct this: return y[(window_len/2-1):-(window_len/2)] instead of just y. - """ - - if x.ndim != 1: - raise ValueError, "smooth only accepts 1 dimension arrays." - - if x.size < window_len: - raise ValueError, "Input vector needs to be bigger than window size." - - - if window_len<3: - return x - - - if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']: - raise ValueError, "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'" - - - s=np.r_[x[window_len-1:0:-1],x,x[-1:-window_len:-1]] - #print(len(s)) - if window == 'flat': #moving average - w=np.ones(window_len,'d') - else: - w=eval('np.'+window+'(window_len)') - - y=np.convolve(w/w.sum(),s,mode='valid') - return y[(window_len//2):-(window_len//2)] - diff --git a/MACS3/Signal/PeakVariants.py b/MACS3/Signal/PeakVariants.py new file mode 100644 index 00000000..451b38b0 --- /dev/null +++ b/MACS3/Signal/PeakVariants.py @@ -0,0 +1,402 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 17:12:29 Tao Liu> + +"""Module for SAPPER PeakVariants class. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +from copy import copy +import cython +from cython.cimports.cpython import bool + + +@cython.cclass +class Variant: + v_ref_pos: cython.long + v_ref_allele: str + v_alt_allele: str + v_GQ: cython.int + v_filter: str + v_type: str + v_mutation_type: str + v_top1allele: str + v_top2allele: str + v_DPT: cython.int + v_DPC: cython.int + v_DP1T: cython.int + v_DP2T: cython.int + v_DP1C: cython.int + v_DP2C: cython.int + v_PLUS1T: cython.int + v_PLUS2T: cython.int + v_MINUS1T: cython.int + v_MINUS2T: cython.int + v_deltaBIC: cython.float + v_BIC_homo_major: cython.float + v_BIC_homo_minor: cython.float + v_BIC_heter_noAS: cython.float + v_BIC_heter_AS: cython.float + v_AR: cython.float + v_GT: str + v_DP: cython.int + v_PL_00: cython.int + v_PL_01: cython.int + v_PL_11: cython.int + + def __init__(self, + ref_allele: str, + alt_allele: str, + GQ: cython.int, + filter: str, + type: str, + mutation_type: str, + top1allele: str, + top2allele: str, + DPT: cython.int, + DPC: cython.int, + DP1T: cython.int, + DP2T: cython.int, + DP1C: cython.int, + DP2C: cython.int, + PLUS1T: cython.int, + PLUS2T: cython.int, + MINUS1T: cython.int, + MINUS2T: cython.int, + deltaBIC: cython.float, + BIC_homo_major: cython.float, + BIC_homo_minor: cython.float, + BIC_heter_noAS: cython.float, + BIC_heter_AS: cython.float, + AR: cython.float, + GT: str, + DP: cython.int, + PL_00: cython.int, + PL_01: cython.int, + PL_11: cython.int): + self.v_ref_allele = ref_allele + self.v_alt_allele = alt_allele + self.v_GQ = GQ + self.v_filter = filter + self.v_type = type + self.v_mutation_type = mutation_type + self.v_top1allele = top1allele + self.v_top2allele = top2allele + self.v_DPT = DPT + self.v_DPC = DPC + self.v_DP1T = DP1T + self.v_DP2T = DP2T + self.v_DP1C = DP1C + self.v_DP2C = DP2C + self.v_PLUS1T = PLUS1T + self.v_PLUS2T = PLUS2T + 
self.v_MINUS1T = MINUS1T
+        self.v_MINUS2T = MINUS2T
+        self.v_deltaBIC = deltaBIC
+        self.v_BIC_homo_major = BIC_homo_major
+        self.v_BIC_homo_minor = BIC_homo_minor
+        self.v_BIC_heter_noAS = BIC_heter_noAS
+        self.v_BIC_heter_AS = BIC_heter_AS
+        self.v_AR = AR
+        self.v_GT = GT
+        self.v_DP = DP
+        self.v_PL_00 = PL_00
+        self.v_PL_01 = PL_01
+        self.v_PL_11 = PL_11
+
+    def __getstate__(self):
+        # self.v_ref_pos,
+        return (self.v_ref_allele,
+                self.v_alt_allele,
+                self.v_GQ,
+                self.v_filter,
+                self.v_type,
+                self.v_mutation_type,
+                self.v_top1allele,
+                self.v_top2allele,
+                self.v_DPT,
+                self.v_DPC,
+                self.v_DP1T,
+                self.v_DP2T,
+                self.v_DP1C,
+                self.v_DP2C,
+                self.v_PLUS1T,
+                self.v_PLUS2T,
+                self.v_MINUS1T,
+                self.v_MINUS2T,
+                self.v_deltaBIC,
+                self.v_BIC_homo_major,
+                self.v_BIC_homo_minor,
+                self.v_BIC_heter_noAS,
+                self.v_BIC_heter_AS,
+                self.v_AR,
+                self.v_GT,
+                self.v_DP,
+                self.v_PL_00,
+                self.v_PL_01,
+                self.v_PL_11)
+
+    def __setstate__(self, state):
+        # self.v_ref_pos,
+        (self.v_ref_allele,
+         self.v_alt_allele,
+         self.v_GQ,
+         self.v_filter,
+         self.v_type,
+         self.v_mutation_type,
+         self.v_top1allele,
+         self.v_top2allele,
+         self.v_DPT,
+         self.v_DPC,
+         self.v_DP1T,
+         self.v_DP2T,
+         self.v_DP1C,
+         self.v_DP2C,
+         self.v_PLUS1T,
+         self.v_PLUS2T,
+         self.v_MINUS1T,
+         self.v_MINUS2T,
+         self.v_deltaBIC,
+         self.v_BIC_homo_major,
+         self.v_BIC_homo_minor,
+         self.v_BIC_heter_noAS,
+         self.v_BIC_heter_AS,
+         self.v_AR,
+         self.v_GT,
+         self.v_DP,
+         self.v_PL_00,
+         self.v_PL_01,
+         self.v_PL_11) = state
+
+    @cython.ccall
+    def is_indel(self) -> bool:
+        if self.v_mutation_type.find("Insertion") != -1 or self.v_mutation_type.find("Deletion") != -1:
+            return True
+        else:
+            return False
+
+    @cython.ccall
+    def is_only_del(self) -> bool:
+        if self.v_mutation_type == "Deletion":
+            return True
+        else:
+            return False
+
+    @cython.ccall
+    def is_only_insertion(self) -> bool:
+        if self.v_mutation_type == "Insertion":
+            return True
+        else:
+            return False
+
+    def __getitem__(self, keyname):
+        if keyname == "ref_allele":
+            return self.v_ref_allele
+        elif keyname == "alt_allele":
+            return self.v_alt_allele
+        elif keyname == "top1allele":
+            return self.v_top1allele
+        elif keyname == "top2allele":
+            return self.v_top2allele
+        elif keyname == "type":
+            # return the declared attribute, not the undefined `self.type`
+            return self.v_type
+        elif keyname == "mutation_type":
+            return self.v_mutation_type
+        else:
+            raise Exception("keyname is not accessible:", keyname)
+
+    def __setitem__(self, keyname, v):
+        if keyname == "ref_allele":
+            self.v_ref_allele = v
+        elif keyname == "alt_allele":
+            self.v_alt_allele = v
+        elif keyname == "top1allele":
+            self.v_top1allele = v
+        elif keyname == "top2allele":
+            self.v_top2allele = v
+        elif keyname == "type":
+            self.v_type = v
+        elif keyname == "mutation_type":
+            self.v_mutation_type = v
+        else:
+            raise Exception("keyname is not accessible:", keyname)
+
+    @cython.ccall
+    def is_refer_biased_01(self,
+                           ar: cython.float = 0.85) -> bool:
+        if self.v_AR >= ar and self.v_ref_allele == self.v_top1allele:
+            return True
+        else:
+            return False
+
+    @cython.ccall
+    def top1isreference(self) -> bool:
+        if self.v_ref_allele == self.v_top1allele:
+            return True
+        else:
+            return False
+
+    @cython.ccall
+    def top2isreference(self) -> bool:
+        if self.v_ref_allele == self.v_top2allele:
+            return True
+        else:
+            return False
+
+    @cython.ccall
+    def toVCF(self) -> str:
+        return "\t".join((self.v_ref_allele, self.v_alt_allele, "%d" % self.v_GQ, self.v_filter,
+
"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % + (self.v_type, self.v_mutation_type, self.v_DPT, self.v_DPC, self.v_DP1T, self.v_top1allele, + self.v_DP2T, self.v_top2allele, self.v_DP1C, self.v_top1allele, self.v_DP2C, self.v_top2allele, + self.v_PLUS1T, self.v_PLUS2T, self.v_MINUS1T, self.v_MINUS2T, + self.v_deltaBIC, + self.v_BIC_homo_major, self.v_BIC_homo_minor, self.v_BIC_heter_noAS,self.v_BIC_heter_AS, + self.v_AR + ), + "GT:DP:GQ:PL", + "%s:%d:%d:%d,%d,%d" % (self.v_GT, self.v_DP, self.v_GQ, self.v_PL_00, self.v_PL_01, self.v_PL_11) + )) + + +@cython.cclass +class PeakVariants: + chrom: str + d_Variants: dict + start: cython.long + end: cython.long + refseq: bytes + + def __init__(self, + chrom: str, + start: cython.long, + end: cython.long, + s: bytes): + self.chrom = chrom + self.d_Variants = {} + self.start = start + self.end = end + self.refseq = s + + def __getstate__(self): + return (self.d_Variants, self.chrom) + + def __setstate__(self, state): + (self.d_Variants, self.chrom) = state + + @cython.ccall + def n_variants(self) -> cython.int: + return len(self.d_Variants) + + @cython.ccall + def add_variant(self, p: cython.long, v: Variant): + self.d_Variants[p] = v + + @cython.ccall + def has_indel(self) -> bool: + p: cython.long + + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_indel(): + return True + return False + + @cython.ccall + def has_refer_biased_01(self) -> bool: + p: cython.long + + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_refer_biased_01(): + return True + return False + + @cython.ccall + def get_refer_biased_01s(self) -> list: + ret_poss: list = [] + p: cython.long + + for p in sorted(self.d_Variants.keys()): + if self.d_Variants[p].is_refer_biased_01(): + ret_poss.append(p) + return ret_poss + + @cython.ccall + def remove_variant(self, p: cython.long): + assert p in self.d_Variants + self.d_Variants.pop(p) + + @cython.ccall + def replace_variant(self, p: cython.long, v: Variant): + assert p in self.d_Variants + self.d_Variants[p] = v + + @cython.ccall + def fix_indels(self): + p0: cython.long + p1: cython.long + p: cython.long + + # merge continuous deletion + p0 = -1 #start of deletion chunk + p1 = -1 #end of deletion chunk + for p in sorted(self.d_Variants.keys()): + if p == p1+1 and self.d_Variants[p].is_only_del() and self.d_Variants[p0].is_only_del(): + # we keep p0, remove p, and add p's ref_allele to p0, keep other information as in p0 + if self.d_Variants[p0].top1isreference: + if self.d_Variants[p0]["top1allele"] == "*": + self.d_Variants[p0]["top1allele"] = "" + self.d_Variants[p0]["top1allele"] += self.d_Variants[p]["ref_allele"] + elif self.d_Variants[p0].top2isreference: + if self.d_Variants[p0]["top2allele"] == "*": + self.d_Variants[p0]["top2allele"] = "" + self.d_Variants[p0]["top2allele"] += self.d_Variants[p]["ref_allele"] + self.d_Variants[p0]["ref_allele"] += self.d_Variants[p]["ref_allele"] + self.d_Variants.pop(p) + p1 = p + else: + p0 = p + p1 = p + + # fix deletion so that if the preceding base is 0/0 -- i.e. not in d_Variants, the reference base will be added. 
+        for p in sorted(self.d_Variants.keys()):
+            if self.d_Variants[p].is_only_del():
+                if not ((p-1) in self.d_Variants):
+                    if p > self.start:  # now add the reference base
+                        self.d_Variants[p-1] = copy(self.d_Variants[p])
+                        # decode bytes to str; str(bytes) would embed the b'' repr
+                        rs = self.refseq.decode()
+                        self.d_Variants[p-1]["ref_allele"] = rs[p - self.start] + self.d_Variants[p-1]["ref_allele"]
+                        self.d_Variants[p-1]["alt_allele"] = rs[p - self.start]
+                        if self.d_Variants[p].top1isreference():
+                            self.d_Variants[p-1]["top1allele"] = self.d_Variants[p-1]["ref_allele"]
+                            self.d_Variants[p-1]["top2allele"] = self.d_Variants[p-1]["alt_allele"]
+                        elif self.d_Variants[p].top2isreference():
+                            self.d_Variants[p-1]["top1allele"] = self.d_Variants[p-1]["alt_allele"]
+                            self.d_Variants[p-1]["top2allele"] = self.d_Variants[p-1]["ref_allele"]
+                        self.d_Variants.pop(p)
+
+        # remove indel if a deletion is immediately following an
+        # insertion -- either a third genotype is found which is not
+        # allowed in this version of SAPPER, or a problem caused by
+        # assembling in a simple repeat region.
+        for p in sorted(self.d_Variants.keys()):
+            if self.d_Variants[p].is_only_del():
+                if (p-1) in self.d_Variants and self.d_Variants[p-1].is_only_insertion():
+                    self.d_Variants.pop(p)
+                    self.d_Variants.pop(p - 1)
+        return
+
+    @cython.ccall
+    def toVCF(self) -> str:
+        p: cython.long
+        res: str
+
+        res = ""
+        for p in sorted(self.d_Variants.keys()):
+            res += "\t".join((self.chrom, str(p+1), ".", self.d_Variants[p].toVCF())) + "\n"
+        return res
diff --git a/MACS3/Signal/PeakVariants.pyx b/MACS3/Signal/PeakVariants.pyx
deleted file mode 100644
index 485988d1..00000000
--- a/MACS3/Signal/PeakVariants.pyx
+++ /dev/null
@@ -1,358 +0,0 @@
-# cython: language_level=3
-# cython: profile=True
-# Time-stamp: <2020-12-04 22:11:09 Tao Liu>
-
-"""Module for SAPPER PeakVariants class.
-
-This code is free software; you can redistribute it and/or modify it
-under the terms of the BSD License (see the file COPYING included
-with the distribution).
-""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from copy import copy -from cpython cimport bool - -cdef class Variant: - cdef: - long v_ref_pos - str v_ref_allele - str v_alt_allele - int v_GQ - str v_filter - str v_type - str v_mutation_type - str v_top1allele - str v_top2allele - int v_DPT - int v_DPC - int v_DP1T - int v_DP2T - int v_DP1C - int v_DP2C - int v_PLUS1T - int v_PLUS2T - int v_MINUS1T - int v_MINUS2T - float v_deltaBIC - float v_BIC_homo_major - float v_BIC_homo_minor - float v_BIC_heter_noAS - float v_BIC_heter_AS - float v_AR - str v_GT - int v_DP - int v_PL_00 - int v_PL_01 - int v_PL_11 - - def __init__ ( self, str ref_allele, str alt_allele, int GQ, str filter, str type, str mutation_type, - str top1allele, str top2allele, int DPT, int DPC, int DP1T, int DP2T, int DP1C, int DP2C, - int PLUS1T, int PLUS2T, int MINUS1T, int MINUS2T, - float deltaBIC, float BIC_homo_major, float BIC_homo_minor, float BIC_heter_noAS, float BIC_heter_AS, - float AR, str GT, int DP, int PL_00, int PL_01, int PL_11): - self.v_ref_allele = ref_allele - self.v_alt_allele = alt_allele - self.v_GQ = GQ - self.v_filter = filter - self.v_type = type - self.v_mutation_type = mutation_type - self.v_top1allele = top1allele - self.v_top2allele = top2allele - self.v_DPT = DPT - self.v_DPC = DPC - self.v_DP1T = DP1T - self.v_DP2T = DP2T - self.v_DP1C = DP1C - self.v_DP2C = DP2C - self.v_PLUS1T = PLUS1T - self.v_PLUS2T = PLUS2T - self.v_MINUS1T = MINUS1T - self.v_MINUS2T = MINUS2T - self.v_deltaBIC = deltaBIC - self.v_BIC_homo_major = BIC_homo_major - self.v_BIC_homo_minor = BIC_homo_minor - self.v_BIC_heter_noAS = BIC_heter_noAS - self.v_BIC_heter_AS = BIC_heter_AS - self.v_AR = AR - self.v_GT = GT - self.v_DP = DP - self.v_PL_00 = PL_00 - self.v_PL_01 = PL_01 - self.v_PL_11 = PL_11 - - def __getstate__ ( self ): - return ( - #self.v_ref_pos, - self.v_ref_allele, - self.v_alt_allele, - self.v_GQ, - self.v_filter, - self.v_type, - self.v_mutation_type, - self.v_top1allele, - self.v_top2allele, - self.v_DPT, - self.v_DPC, - self.v_DP1T, - self.v_DP2T, - self.v_DP1C, - self.v_DP2C, - self.v_PLUS1T, - self.v_PLUS2T, - self.v_MINUS1T, - self.v_MINUS2T, - self.v_deltaBIC, - self.v_BIC_homo_major, - self.v_BIC_homo_minor, - self.v_BIC_heter_noAS, - self.v_BIC_heter_AS, - self.v_AR, - self.v_GT, - self.v_DP, - self.v_PL_00, - self.v_PL_01, - self.v_PL_11 ) - - def __setstate__ ( self, state ): - ( #self.v_ref_pos, - self.v_ref_allele, - self.v_alt_allele, - self.v_GQ, - self.v_filter, - self.v_type, - self.v_mutation_type, - self.v_top1allele, - self.v_top2allele, - self.v_DPT, - self.v_DPC, - self.v_DP1T, - self.v_DP2T, - self.v_DP1C, - self.v_DP2C, - self.v_PLUS1T, - self.v_PLUS2T, - self.v_MINUS1T, - self.v_MINUS2T, - self.v_deltaBIC, - self.v_BIC_homo_major, - self.v_BIC_homo_minor, - self.v_BIC_heter_noAS, - self.v_BIC_heter_AS, - self.v_AR, - self.v_GT, - self.v_DP, - self.v_PL_00, - self.v_PL_01, - self.v_PL_11 ) = state - - cpdef bool is_indel ( self ): - if self.v_mutation_type.find("Insertion") != -1 or self.v_mutation_type.find("Deletion") != -1: - return True - else: - return False - - cpdef bool is_only_del ( self ): - if self.v_mutation_type == "Deletion": - return True - else: - return False - - cpdef bool is_only_insertion ( self ): - if self.v_mutation_type == "Insertion": - return True - else: - return False - - def __getitem__ ( self, keyname ): - if keyname == "ref_allele": - return self.v_ref_allele - elif keyname == "alt_allele": - 
return self.v_alt_allele - elif keyname == "top1allele": - return self.v_top1allele - elif keyname == "top2allele": - return self.v_top2allele - elif keyname == "type": - return self.type - elif keyname == "mutation_type": - return self.mutation_type - else: - raise Exception("keyname is not accessible:", keyname) - - def __setitem__ ( self, keyname, v ): - if keyname == "ref_allele": - self.v_ref_allele = v - elif keyname == "alt_allele": - self.v_alt_allele = v - elif keyname == "top1allele": - self.v_top1allele = v - elif keyname == "top2allele": - self.v_top2allele = v - elif keyname == "type": - self.type = v - elif keyname == "mutation_type": - self.mutation_type = v - else: - raise Exception("keyname is not accessible:", keyname) - - cpdef bool is_refer_biased_01 ( self, float ar=0.85 ): - if self.v_AR >= ar and self.v_ref_allele == self.v_top1allele: - return True - else: - return False - - - cpdef bool top1isreference ( self ): - if self.v_ref_allele == self.v_top1allele: - return True - else: - return False - - cpdef bool top2isreference ( self ): - if self.v_ref_allele == self.v_top2allele: - return True - else: - return False - - cpdef str toVCF ( self ): - return "\t".join( ( self.v_ref_allele, self.v_alt_allele, "%d" % self.v_GQ, self.v_filter, - "M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \ - (self.v_type, self.v_mutation_type, self.v_DPT, self.v_DPC, self.v_DP1T, self.v_top1allele, - self.v_DP2T, self.v_top2allele, self.v_DP1C, self.v_top1allele, self.v_DP2C, self.v_top2allele, - self.v_PLUS1T, self.v_PLUS2T, self.v_MINUS1T, self.v_MINUS2T, - self.v_deltaBIC, - self.v_BIC_homo_major, self.v_BIC_homo_minor, self.v_BIC_heter_noAS,self.v_BIC_heter_AS, - self.v_AR - ), - "GT:DP:GQ:PL", - "%s:%d:%d:%d,%d,%d" % (self.v_GT, self.v_DP, self.v_GQ, self.v_PL_00, self.v_PL_01, self.v_PL_11) - ) ) - -cdef class PeakVariants: - cdef: - str chrom - dict d_Variants - long start - long end - bytes refseq - - - def __init__ ( self, str chrom, long start, long end, bytes s ): - self.chrom = chrom - self.d_Variants = {} - self.start = start - self.end = end - self.refseq = s - - def __getstate__ ( self ): - return ( self.d_Variants, self.chrom ) - - def __setstate__ ( self, state ): - ( self.d_Variants, self.chrom ) = state - - cpdef int n_variants ( self ): - return len(self.d_Variants) - - cpdef add_variant ( self, long p, Variant v ): - self.d_Variants[ p ] = v - - cpdef bool has_indel ( self ): - cdef: - long p - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_indel(): - return True - return False - - cpdef bool has_refer_biased_01 ( self ): - cdef: - long p - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_refer_biased_01(): - return True - return False - - cpdef list get_refer_biased_01s ( self ): - cdef: - list ret_poss = [] - long p - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_refer_biased_01(): - ret_poss.append( p ) - return ret_poss - - cpdef remove_variant ( self, long p ): - assert p in self.d_Variants - self.d_Variants.pop( p ) - - cpdef replace_variant ( self, long p, Variant v ): - assert p in self.d_Variants - self.d_Variants[ p ] = v - - cpdef fix_indels ( self ): - cdef: - long p0, p1, p - - # merge continuous deletion - p0 = -1 #start of deletion chunk - p1 = -1 #end of deletion chunk - for p in sorted( self.d_Variants.keys() ): - if p == p1+1 and self.d_Variants[ p 
].is_only_del() and self.d_Variants[ p0 ].is_only_del() : - # we keep p0, remove p, and add p's ref_allele to p0, keep other information as in p0 - if self.d_Variants[ p0 ].top1isreference: - if self.d_Variants[ p0 ]["top1allele"] == "*": - self.d_Variants[ p0 ]["top1allele"] = "" - self.d_Variants[ p0 ]["top1allele"] += self.d_Variants[ p ]["ref_allele"] - elif self.d_Variants[ p0 ].top2isreference: - if self.d_Variants[ p0 ]["top2allele"] == "*": - self.d_Variants[ p0 ]["top2allele"] = "" - self.d_Variants[ p0 ]["top2allele"] += self.d_Variants[ p ]["ref_allele"] - self.d_Variants[ p0 ]["ref_allele"] += self.d_Variants[ p ]["ref_allele"] - self.d_Variants.pop ( p ) - p1 = p - else: - p0 = p - p1 = p - - # fix deletion so that if the preceding base is 0/0 -- i.e. not in d_Variants, the reference base will be added. - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_only_del(): - if not( ( p-1 ) in self.d_Variants ): - if p > self.start: # now add the reference base - self.d_Variants[ p-1 ] = copy(self.d_Variants[ p ]) - rs = str(self.refseq) - self.d_Variants[ p-1 ]["ref_allele"] = rs[ p - self.start ] + self.d_Variants[ p-1 ]["ref_allele"] - self.d_Variants[ p-1 ]["alt_allele"] = rs[ p - self.start ] - if self.d_Variants[ p ].top1isreference: - self.d_Variants[ p-1 ]["top1allele"] = self.d_Variants[ p-1 ]["ref_allele"] - self.d_Variants[ p-1 ]["top2allele"] = self.d_Variants[ p-1 ]["alt_allele"] - elif self.d_Variants[ p ].top2isreference: - self.d_Variants[ p-1 ]["top1allele"] = self.d_Variants[ p-1 ]["alt_allele"] - self.d_Variants[ p-1 ]["top2allele"] = self.d_Variants[ p-1 ]["ref_allele"] - self.d_Variants.pop( p ) - - # remove indel if a deletion is immediately following an - # insertion -- either a third genotype is found which is not - # allowed in this version of sapper, or problem caused by - # assembling in a simple repeat region. - - for p in sorted( self.d_Variants.keys() ): - if self.d_Variants[ p ].is_only_del(): - if ( p-1 ) in self.d_Variants and self.d_Variants[p-1].is_only_insertion(): - self.d_Variants.pop( p ) - self.d_Variants.pop( p - 1 ) - return - - cpdef str toVCF ( self ): - cdef: - long p - str res - res = "" - for p in sorted( self.d_Variants.keys() ): - res += "\t".join( ( self.chrom, str(p+1), ".", self.d_Variants[ p ].toVCF() ) ) + "\n" - return res - - diff --git a/MACS3/Signal/Pileup.py b/MACS3/Signal/Pileup.py index e5068c65..074d14a6 100644 --- a/MACS3/Signal/Pileup.py +++ b/MACS3/Signal/Pileup.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-06 20:51:44 Tao Liu> +# Time-stamp: <2024-10-22 10:35:32 Tao Liu> """Module Description: For pileup functions. @@ -24,7 +24,6 @@ # ------------------------------------ import numpy as np import cython.cimports.numpy as cnp -from cython.cimports.numpy import int32_t, float32_t from cython.cimports.cpython import bool # ------------------------------------ @@ -60,9 +59,7 @@ def fix_coordinates(poss: cnp.ndarray, rlength: cython.int) -> cnp.ndarray: """Fix the coordinates. 
""" i: cython.long - ptr: cython.pointer(int32_t) = cython.cast(cython.pointer(int32_t), poss.data) # pointer - - #ptr = poss.data + ptr: cython.pointer(cython.int) = cython.cast(cython.pointer(cython.int), poss.data) # pointer # fix those negative coordinates for i in range(poss.shape[0]): @@ -276,10 +273,10 @@ def se_all_in_one_pileup(plus_tags: cnp.ndarray, ret_v: cnp.ndarray # pointers are used for numpy arrays - start_poss_ptr: cython.pointer(int32_t) - end_poss_ptr: cython.pointer(int32_t) - ret_p_ptr: cython.pointer(int32_t) - ret_v_ptr: cython.pointer(float32_t) + start_poss_ptr: cython.pointer(cython.int) + end_poss_ptr: cython.pointer(cython.int) + ret_p_ptr: cython.pointer(cython.int) + ret_v_ptr: cython.pointer(cython.float) start_poss = np.concatenate((plus_tags-five_shift, minus_tags-three_shift)) end_poss = np.concatenate((plus_tags+three_shift, minus_tags+five_shift)) @@ -294,14 +291,14 @@ def se_all_in_one_pileup(plus_tags: cnp.ndarray, lx = start_poss.shape[0] - start_poss_ptr = cython.cast(cython.pointer(int32_t), start_poss.data) # start_poss.data - end_poss_ptr = cython.cast(cython.pointer(int32_t), end_poss.data) # end_poss.data + start_poss_ptr = cython.cast(cython.pointer(cython.int), start_poss.data) # start_poss.data + end_poss_ptr = cython.cast(cython.pointer(cython.int), end_poss.data) # end_poss.data ret_p = np.zeros(2 * lx, dtype="i4") ret_v = np.zeros(2 * lx, dtype="f4") - ret_p_ptr = cython.cast(cython.pointer(int32_t), ret_p.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_p_ptr = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) tmp = [ret_p, ret_v] # for (endpos,value) @@ -421,19 +418,19 @@ def quick_pileup(start_poss: cnp.ndarray, tmp: list # pointers are used for numpy arrays - start_poss_ptr: cython.pointer(int32_t) - end_poss_ptr: cython.pointer(int32_t) - ret_p_ptr: cython.pointer(int32_t) - ret_v_ptr: cython.pointer(float32_t) + start_poss_ptr: cython.pointer(cython.int) + end_poss_ptr: cython.pointer(cython.int) + ret_p_ptr: cython.pointer(cython.int) + ret_v_ptr: cython.pointer(cython.float) - start_poss_ptr = cython.cast(cython.pointer(int32_t), start_poss.data) # start_poss.data - end_poss_ptr = cython.cast(cython.pointer(int32_t), end_poss.data) # end_poss.data + start_poss_ptr = cython.cast(cython.pointer(cython.int), start_poss.data) # start_poss.data + end_poss_ptr = cython.cast(cython.pointer(cython.int), end_poss.data) # end_poss.data ret_p = np.zeros(l, dtype="i4") ret_v = np.zeros(l, dtype="f4") - ret_p_ptr = cython.cast(cython.pointer(int32_t), ret_p.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_p_ptr = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) tmp = [ret_p, ret_v] # for (endpos,value) @@ -530,23 +527,23 @@ def naive_quick_pileup(sorted_poss: cnp.ndarray, extension: int) -> list: ret_v: cnp.ndarray # pointers are used for numpy arrays - start_poss_ptr: cython.pointer(int32_t) - end_poss_ptr: cython.pointer(int32_t) - ret_p_ptr: cython.pointer(int32_t) - ret_v_ptr: cython.pointer(float32_t) + start_poss_ptr: cython.pointer(cython.int) + end_poss_ptr: cython.pointer(cython.int) + ret_p_ptr: cython.pointer(cython.int) + ret_v_ptr: cython.pointer(cython.float) start_poss = sorted_poss - extension start_poss[start_poss < 0] = 0 end_poss = sorted_poss + extension - start_poss_ptr = cython.cast(cython.pointer(int32_t), start_poss.data) # 
start_poss.data - end_poss_ptr = cython.cast(cython.pointer(int32_t), end_poss.data) # end_poss.data + start_poss_ptr = cython.cast(cython.pointer(cython.int), start_poss.data) # start_poss.data + end_poss_ptr = cython.cast(cython.pointer(cython.int), end_poss.data) # end_poss.data ret_p = np.zeros(2*l, dtype="i4") ret_v = np.zeros(2*l, dtype="f4") - ret_p_ptr = cython.cast(cython.pointer(int32_t), ret_p.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_p_ptr = cython.cast(cython.pointer(cython.int), ret_p.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) if l == 0: raise Exception("length is 0") @@ -627,7 +624,7 @@ def over_two_pv_array(pv_array1: list, pv_array2: list, func: str = "max") -> li available operations are 'max', 'min', and 'mean' """ - #pre_p: cython.int + # pre_p: cython.int l1: cython.long l2: cython.long @@ -643,12 +640,12 @@ def over_two_pv_array(pv_array1: list, pv_array2: list, func: str = "max") -> li ret_v: cnp.ndarray # pointers are used for numpy arrays - a1_pos_ptr: cython.pointer(int32_t) - a2_pos_ptr: cython.pointer(int32_t) - ret_pos_ptr: cython.pointer(int32_t) - a1_v_ptr: cython.pointer(float32_t) - a2_v_ptr: cython.pointer(float32_t) - ret_v_ptr: cython.pointer(float32_t) + a1_pos_ptr: cython.pointer(cython.int) + a2_pos_ptr: cython.pointer(cython.int) + ret_pos_ptr: cython.pointer(cython.int) + a1_v_ptr: cython.pointer(cython.float) + a2_v_ptr: cython.pointer(cython.float) + ret_v_ptr: cython.pointer(cython.float) if func == "max": f = max @@ -661,15 +658,15 @@ def over_two_pv_array(pv_array1: list, pv_array2: list, func: str = "max") -> li [a1_pos, a1_v] = pv_array1 [a2_pos, a2_v] = pv_array2 - ret_pos = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="int32") - ret_v = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="float32") - - a1_pos_ptr = cython.cast(cython.pointer(int32_t), a1_pos.data) - a1_v_ptr = cython.cast(cython.pointer(float32_t), a1_v.data) - a2_pos_ptr = cython.cast(cython.pointer(int32_t), a2_pos.data) - a2_v_ptr = cython.cast(cython.pointer(float32_t), a2_v.data) - ret_pos_ptr = cython.cast(cython.pointer(int32_t), ret_pos.data) - ret_v_ptr = cython.cast(cython.pointer(float32_t), ret_v.data) + ret_pos = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="i4") + ret_v = np.zeros(a1_pos.shape[0] + a2_pos.shape[0], dtype="f4") + + a1_pos_ptr = cython.cast(cython.pointer(cython.int), a1_pos.data) + a1_v_ptr = cython.cast(cython.pointer(cython.float), a1_v.data) + a2_pos_ptr = cython.cast(cython.pointer(cython.int), a2_pos.data) + a2_v_ptr = cython.cast(cython.pointer(cython.float), a2_v.data) + ret_pos_ptr = cython.cast(cython.pointer(cython.int), ret_pos.data) + ret_v_ptr = cython.cast(cython.pointer(cython.float), ret_v.data) l1 = a1_pos.shape[0] l2 = a2_pos.shape[0] diff --git a/MACS3/Signal/PileupV2.py b/MACS3/Signal/PileupV2.py index 62f065f4..299ecf6e 100644 --- a/MACS3/Signal/PileupV2.py +++ b/MACS3/Signal/PileupV2.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-10-04 23:59:48 Tao Liu> +# Time-stamp: <2024-10-14 21:19:00 Tao Liu> """Module Description: @@ -34,33 +34,19 @@ for i from 0 to 2N in PV_sorted: 1: z = z + v_i 2: e = p_i - 3: save the pileup from position s to e is z -- in bedGraph style is to only save (e, z) + 3: save the pileup from position s to e is z, + in bedGraph style is to only save (e, z) 4: s = e This code is free software; you can redistribute it and/or modify it under the terms of the BSD License (see the file LICENSE 
included with the distribution). - """ - -# ------------------------------------ -# python modules -# ------------------------------------ - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ - -# ------------------------------------ -# Other modules # ------------------------------------ import numpy as np import cython import cython.cimports.numpy as cnp -from cython.cimports.numpy import int32_t, float32_t, uint64_t -# ------------------------------------ -# C lib # ------------------------------------ # from cython.cimports.libc.stdlib import malloc, free, qsort @@ -70,7 +56,7 @@ @cython.ccall -def mapping_function_always_1(L: int32_t, R: int32_t) -> float32_t: +def mapping_function_always_1(L: cython.int, R: cython.int) -> cython.float: # always return 1, useful while the weight is already 1, or in # case of simply piling up fragments for coverage. return 1.0 @@ -86,81 +72,6 @@ def clean_up_ndarray(x: cnp.ndarray): x.resize(0, refcheck=False) return -# ------------------------------------ -# public python functions -# ------------------------------------ - - -@cython.ccall -def pileup_from_LR_hmmratac(LR_array: cnp.ndarray, - mapping_dict: dict) -> cnp.ndarray: - # this function is specifically designed for piling up fragments - # for `hmmratac`. - # - # As for `hmmratac`, the weight depends on the length of the - # fragment, aka, value of R-L. Therefore, we need a mapping_dict - # for mapping length to weight. - l_LR: uint64_t - l_PV: uint64_t - i: uint64_t - L: int32_t - R: int32_t - PV: cnp.ndarray - pileup: cnp.ndarray - - l_LR = LR_array.shape[0] - l_PV = 2 * l_LR - PV = np.zeros(shape=l_PV, dtype=[('p', 'uint32'), ('v', 'float32')]) - for i in range(l_LR): - (L, R) = LR_array[i] - PV[i*2] = (L, mapping_dict[R - L]) - PV[i*2 + 1] = (R, -1 * mapping_dict[R - L]) - PV.sort(order='p') - pileup = pileup_PV(PV) - clean_up_ndarray(PV) - return pileup - - -@cython.ccall -def pileup_from_LR(LR_array: cnp.ndarray, - mapping_func=mapping_function_always_1) -> cnp.ndarray: - """This function will pile up the ndarray containing left and - right positions, which is typically from PETrackI object. It's - useful when generating the pileup of a single chromosome is - needed. - - User needs to provide a numpy array of left and right positions, - with dtype=[('l','int32'),('r','int32')]. User also needs to - provide a mapping function to map the left and right position to - certain weight. - - """ - PV_array: cnp.ndarray - pileup: cnp.ndarray - - PV_array = make_PV_from_LR(LR_array, mapping_func=mapping_func) - pileup = pileup_PV(PV_array) - clean_up_ndarray(PV_array) - return pileup - - -@cython.ccall -def pileup_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray, - extsize: cython.int) -> cnp.ndarray: - """This function will pile up the ndarray containing plus - (positive) and minus (negative) positions of all reads, which is - typically from FWTrackI object. It's useful when generating the - pileup of a single chromosome is needed. - - """ - PV_array: cnp.ndarray - pileup: cnp.ndarray - - PV_array = make_PV_from_PN(P_array, N_array, extsize) - pileup = pileup_PV(PV_array) - clean_up_ndarray(PV_array) - return pileup - @cython.cfunc def make_PV_from_LR(LR_array: cnp.ndarray, @@ -170,16 +81,16 @@ def make_PV_from_LR(LR_array: cnp.ndarray, `mapping_func( L, R )` or simply 1 if mapping_func is the default. 
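+
+    For example, passing mapping_func=lambda L, R: float(R - L) would
+    weight each fragment by its length; this lambda is only an
+    illustration, not a function defined in this module.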
LR array is an np.ndarray as with dtype - [('l','int32'),('r','int32')] with length of N + [('l','i4'),('r','i4')] with length of N PV array is an np.ndarray with - dtype=[('p','uint32'),('v','float32')] with length of 2N + dtype=[('p','u4'),('v','f4')] with length of 2N """ - l_LR: uint64_t - l_PV: uint64_t - i: uint64_t - L: int32_t - R: int32_t + l_LR: cython.ulong + l_PV: cython.ulong + i: cython.ulong + L: cython.int + R: cython.int PV: cnp.ndarray l_LR = LR_array.shape[0] @@ -193,6 +104,38 @@ def make_PV_from_LR(LR_array: cnp.ndarray, return PV +@cython.cfunc +def make_PV_from_LRC(LRC_array: cnp.ndarray, + mapping_func=mapping_function_always_1) -> cnp.ndarray: + """Make sorted PV array from a LR array for certain chromosome in a + PETrackII object. The V/weight will be assigned as + `mapping_func( L, R )` or simply 1 if mapping_func is the default. + + LRC array is an np.ndarray as with dtype + [('l','i4'),('r','i4'),('c','u1')] with length of N + + PV array is an np.ndarray with + dtype=[('p','u4'),('v','f4')] with length of 2N + """ + l_LRC: cython.ulong + l_PV: cython.ulong + i: cython.ulong + L: cython.int + R: cython.int + C: cython.uchar + PV: cnp.ndarray + + l_LRC = LRC_array.shape[0] + l_PV = 2 * l_LRC + PV = np.zeros(shape=l_PV, dtype=[('p', 'u4'), ('v', 'f4')]) + for i in range(l_LRC): + (L, R, C) = LRC_array[i] + PV[i*2] = (L, C*mapping_func(L, R)) + PV[i*2 + 1] = (R, -1.0 * C * mapping_func(L, R)) + PV.sort(order='p') + return PV + + @cython.cfunc def make_PV_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray, extsize: cython.int) -> cnp.ndarray: @@ -202,22 +145,22 @@ def make_PV_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray, in this case since all positions should be extended with a fixed 'extsize'. - P_array or N_array is an np.ndarray with dtype='int32' + P_array or N_array is an np.ndarray with dtype='i4' PV array is an np.ndarray with - dtype=[('p','uint32'),('v','float32')] with length of 2N + dtype=[('p','u4'),('v','f4')] with length of 2N """ - l_PN: uint64_t - l_PV: uint64_t - i: uint64_t - L: int32_t - R: int32_t + l_PN: cython.ulong + l_PV: cython.ulong + i: cython.ulong + L: cython.int + R: cython.int PV: cnp.ndarray l_PN = P_array.shape[0] assert l_PN == N_array.shape[0] l_PV = 4 * l_PN - PV = np.zeros(shape=l_PV, dtype=[('p', 'uint32'), ('v', 'float32')]) + PV = np.zeros(shape=l_PV, dtype=[('p', 'u4'), ('v', 'f4')]) for i in range(l_PN): L = P_array[i] R = L + extsize @@ -246,24 +189,29 @@ def pileup_PV(PV_array: cnp.ndarray) -> cnp.ndarray: save the pileup from position s to e is z -- in bedGraph style is to only save (e, z) s = e """ - z: float32_t - v: float32_t - pre_z: float32_t - s: uint64_t - e: uint64_t - i: uint64_t - c: uint64_t - pileup_PV: cnp.ndarray # this is in bedGraph style as in Pileup.pyx, p is the end of a region, and v is the pileup value + z: cython.float + v: cython.float + pre_z: cython.float + s: cython.ulong + e: cython.ulong + i: cython.ulong + c: cython.ulong + # this is in bedGraph style as in Pileup.pyx, p is the end of a + # region, and v is the pileup value. 
+    pileup_PV: cnp.ndarray
 
     z = 0
     pre_z = -10000
     s = 0
-    pileup_PV = np.zeros(shape=PV_array.shape[0], dtype=[('p', 'uint32'), ('v', 'float32')])
+    pileup_PV = np.zeros(shape=PV_array.shape[0], dtype=[('p', 'u4'),
+                                                         ('v', 'f4')])
     c = 0
     for i in range(PV_array.shape[0]):
         e = PV_array[i]['p']
         v = PV_array[i]['v']
-        if e != s:  # make sure only to record the final value for the same position
-            if z == pre_z:  # merge the p-v pair with the previous pair if the same v is found
+        # make sure only to record the final value for the same position
+        if e != s:
+            # merge the p-v pair with the previous pair if the same v is found
+            if z == pre_z:
                 pileup_PV[c-1]['p'] = e
             else:
                 pileup_PV[c] = (e, z)
@@ -274,3 +222,102 @@ def pileup_PV(PV_array: cnp.ndarray) -> cnp.ndarray:
     pileup_PV.resize(c, refcheck=False)
     # assert z == 0
     return pileup_PV
+
+# ------------------------------------
+# public python functions
+# ------------------------------------
+
+
+@cython.ccall
+def pileup_from_LR_hmmratac(LR_array: cnp.ndarray,
+                            mapping_dict: dict) -> cnp.ndarray:
+    # this function is specifically designed for piling up fragments
+    # for `hmmratac`.
+    #
+    # As for `hmmratac`, the weight depends on the length of the
+    # fragment, i.e. the value of R-L. Therefore, we need a
+    # mapping_dict for mapping length to weight.
+    l_LR: cython.ulong
+    l_PV: cython.ulong
+    i: cython.ulong
+    L: cython.int
+    R: cython.int
+    PV: cnp.ndarray
+    pileup: cnp.ndarray
+
+    l_LR = LR_array.shape[0]
+    l_PV = 2 * l_LR
+    PV = np.zeros(shape=l_PV, dtype=[('p', 'u4'), ('v', 'f4')])
+    for i in range(l_LR):
+        (L, R) = LR_array[i]
+        PV[i*2] = (L, mapping_dict[R - L])
+        PV[i*2 + 1] = (R, -1 * mapping_dict[R - L])
+    PV.sort(order='p')
+    pileup = pileup_PV(PV)
+    clean_up_ndarray(PV)
+    return pileup
+
+
+@cython.ccall
+def pileup_from_LR(LR_array: cnp.ndarray,
+                   mapping_func=mapping_function_always_1) -> cnp.ndarray:
+    """This function will pile up the ndarray containing left and
+    right positions, which is typically from a PETrackI object. It's
+    useful when the pileup of a single chromosome is needed.
+
+    User needs to provide a numpy array of left and right positions,
+    with dtype=[('l','i4'),('r','i4')]. User also needs to
+    provide a mapping function to map the left and right position to
+    certain weight.
+
+    """
+    PV_array: cnp.ndarray
+    pileup: cnp.ndarray
+
+    PV_array = make_PV_from_LR(LR_array, mapping_func=mapping_func)
+    pileup = pileup_PV(PV_array)
+    clean_up_ndarray(PV_array)
+    return pileup
+
+
+@cython.ccall
+def pileup_from_LRC(LRC_array: cnp.ndarray,
+                    mapping_func=mapping_function_always_1) -> cnp.ndarray:
+    """This function will pile up the ndarray containing left and
+    right positions and the counts, which is typically from a
+    PETrackII object. It's useful when the pileup of a single
+    chromosome is needed.
+
+    User needs to provide a numpy array of left and right positions
+    and the counts, with
+    dtype=[('l','i4'),('r','i4'),('c','u1')]. User also needs to
+    provide a mapping function to map the left and right position to
+    certain weight.
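+
+    A minimal usage sketch (coordinates and counts are made up for
+    illustration; assumes numpy is imported as np):
+
+        LRC = np.array([(10, 20, 2), (15, 25, 1)],
+                       dtype=[('l', 'i4'), ('r', 'i4'), ('c', 'u1')])
+        pileup = pileup_from_LRC(LRC)
+        # bedGraph-style (end, value) pairs, roughly:
+        # [(10, 0.0), (15, 2.0), (20, 3.0), (25, 1.0)]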
+ + """ + PV_array: cnp.ndarray + pileup: cnp.ndarray + + PV_array = make_PV_from_LRC(LRC_array, mapping_func=mapping_func) + pileup = pileup_PV(PV_array) + clean_up_ndarray(PV_array) + return pileup + + +@cython.ccall +def pileup_from_PN(P_array: cnp.ndarray, N_array: cnp.ndarray, + extsize: cython.int) -> cnp.ndarray: + """This function will pile up the ndarray containing plus + (positive) and minus (negative) positions of all reads, which is + typically from FWTrackI object. It's useful when generating the + pileup of a single chromosome is needed. + + """ + PV_array: cnp.ndarray + pileup: cnp.ndarray + + PV_array = make_PV_from_PN(P_array, N_array, extsize) + pileup = pileup_PV(PV_array) + clean_up_ndarray(PV_array) + return pileup diff --git a/MACS3/Signal/PosReadsInfo.py b/MACS3/Signal/PosReadsInfo.py new file mode 100644 index 00000000..43b52538 --- /dev/null +++ b/MACS3/Signal/PosReadsInfo.py @@ -0,0 +1,670 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 16:59:53 Tao Liu> + +"""Module for SAPPER PosReadsInfo class. + +Copyright (c) 2017 Tao Liu + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). + +@status: experimental +@version: $Revision$ +@author: Tao Liu +@contact: tliu4@buffalo.edu +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +from MACS3.Signal.VariantStat import (CalModel_Homo, + CalModel_Heter_noAS, + CalModel_Heter_AS) +# calculate_GQ, +# calculate_GQ_heterASsig) +from MACS3.Signal.Prob import binomial_cdf +from MACS3.Signal.PeakVariants import Variant + +import cython +import numpy as np +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool + +LN10 = 2.3025850929940458 + +# ------------------------------------ +# constants +# ------------------------------------ +__version__ = "Parser $Revision$" +__author__ = "Tao Liu " +__doc__ = "All Parser classes" + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class PosReadsInfo: + ref_pos: cython.long + ref_allele: cython.bytes + alt_allele: cython.bytes + filterout: bool # if true, do not output + bq_set_T: dict # {A:[], C:[], G:[], T:[], N:[]} for treatment + bq_set_C: dict + n_reads_T: dict # {A:[], C:[], G:[], T:[], N:[]} for treatment + n_reads_C: dict + n_reads: dict + n_strand: list # [{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only + n_tips: dict # count of nt appearing at tips + top1allele: cython.bytes + top2allele: cython.bytes + top12alleles_ratio: cython.float + lnL_homo_major: cython.double + lnL_heter_AS: cython.double + lnL_heter_noAS: cython.double + lnL_homo_minor: cython.double + BIC_homo_major: cython.double + BIC_heter_AS: cython.double + BIC_heter_noAS: cython.double + BIC_homo_minor: cython.double + PL_00: cython.double + PL_01: cython.double + PL_11: cython.double + deltaBIC: cython.double + heter_noAS_kc: cython.int + heter_noAS_ki: cython.int + heter_AS_kc: cython.int + heter_AS_ki: cython.int + heter_AS_alleleratio: cython.double + GQ_homo_major: cython.int + GQ_heter_noAS: cython.int + GQ_heter_AS: cython.int # phred scale of prob by standard formular + GQ_heter_ASsig: cython.int # phred scale of prob, to measure the difference 
between AS and noAS + GQ: cython.double + GT: str + type: str + mutation_type: str # SNV or Insertion or Deletion + hasfermiinfor: bool # if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false; + fermiNTs: bytearray + + def __cinit__(self): + self.filterout = False + self.GQ = 0 + self.GT = "unsure" + self.alt_allele = b'.' + + def __init__(self, + ref_pos: cython.long, + ref_allele: cython.bytes): + self.ref_pos = ref_pos + self.ref_allele = ref_allele + self.bq_set_T = {ref_allele: [], b'A': [], b'C': [], b'G': [], b'T': [], b'N': [], b'*': []} + self.bq_set_C = {ref_allele: [], b'A': [], b'C': [], b'G': [], b'T': [], b'N': [], b'*': []} + self.n_reads_T = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + self.n_reads_C = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + self.n_reads = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + self.n_strand = [{ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0}, + {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0}] + self.n_tips = {ref_allele: 0, b'A': 0, b'C': 0, b'G': 0, b'T': 0, b'N': 0, b'*': 0} + + def __getstate__(self): + return (self.ref_pos, + self.ref_allele, + self.alt_allele, + self.filterout, + self.bq_set_T, + self.bq_set_C, + self.n_reads_T, + self.n_reads_C, + self.n_reads, + self.n_strand, + self.n_tips, + self.top1allele, + self.top2allele, + self.top12alleles_ratio, + self.lnL_homo_major, + self.lnL_heter_AS, + self.lnL_heter_noAS, + self.lnL_homo_minor, + self.BIC_homo_major, + self.BIC_heter_AS, + self.BIC_heter_noAS, + self.BIC_homo_minor, + self.heter_noAS_kc, + self.heter_noAS_ki, + self.heter_AS_kc, + self.heter_AS_ki, + self.heter_AS_alleleratio, + self.GQ_homo_major, + self.GQ_heter_noAS, + self.GQ_heter_AS, + self.GQ_heter_ASsig, + self.GQ, + self.GT, + self.type, + self.hasfermiinfor, + self.fermiNTs) + + def __setstate__(self, state): + (self.ref_pos, + self.ref_allele, + self.alt_allele, + self.filterout, + self.bq_set_T, + self.bq_set_C, + self.n_reads_T, + self.n_reads_C, + self.n_reads, + self.n_strand, + self.n_tips, + self.top1allele, + self.top2allele, + self.top12alleles_ratio, + self.lnL_homo_major, + self.lnL_heter_AS, + self.lnL_heter_noAS, + self.lnL_homo_minor, + self.BIC_homo_major, + self.BIC_heter_AS, + self.BIC_heter_noAS, + self.BIC_homo_minor, + self.heter_noAS_kc, + self.heter_noAS_ki, + self.heter_AS_kc, + self.heter_AS_ki, + self.heter_AS_alleleratio, + self.GQ_homo_major, + self.GQ_heter_noAS, + self.GQ_heter_AS, + self.GQ_heter_ASsig, + self.GQ, + self.GT, + self.type, + self.hasfermiinfor, + self.fermiNTs) = state + + @cython.ccall + def filterflag(self) -> bool: + return self.filterout + + @cython.ccall + def apply_GQ_cutoff(self, + min_homo_GQ: cython.int = 50, + min_heter_GQ: cython.int = 100): + if self.filterout: + return + if self.type.startswith('homo') and self.GQ < min_homo_GQ: + self.filterout = True + elif self.type.startswith('heter') and self.GQ < min_heter_GQ: + self.filterout = True + return + + @cython.ccall + def apply_deltaBIC_cutoff(self, + min_delta_BIC: cython.float = 10): + if self.filterout: + return + if self.deltaBIC < min_delta_BIC: + self.filterout = True + return + + @cython.ccall + def add_T(self, + read_index: cython.int, + read_allele: cython.bytes, + read_bq: cython.int, + strand: cython.int, + tip: bool, + Q: cython.int = 20): + """ Strand 0: plus, 1: minus + + Q is the 
quality cutoff. Only bases with read_bq > Q (20, i.e. Q20, by default) are counted.
+        """
+        if read_bq <= Q:
+            return
+        if read_allele not in self.n_reads:
+            self.bq_set_T[read_allele] = []
+            self.bq_set_C[read_allele] = []
+            self.n_reads_T[read_allele] = 0
+            self.n_reads_C[read_allele] = 0
+            self.n_reads[read_allele] = 0
+            self.n_strand[0][read_allele] = 0
+            self.n_strand[1][read_allele] = 0
+            self.n_tips[read_allele] = 0
+        self.bq_set_T[read_allele].append(read_bq)
+        self.n_reads_T[read_allele] += 1
+        self.n_reads[read_allele] += 1
+        self.n_strand[strand][read_allele] += 1
+        if tip:
+            self.n_tips[read_allele] += 1
+
+    @cython.ccall
+    def add_C(self,
+              read_index: cython.int,
+              read_allele: cython.bytes,
+              read_bq: cython.int,
+              strand: cython.int,
+              Q: cython.int = 20):
+        if read_bq <= Q:
+            return
+        if read_allele not in self.n_reads:
+            self.bq_set_T[read_allele] = []
+            self.bq_set_C[read_allele] = []
+            self.n_reads_T[read_allele] = 0
+            self.n_reads_C[read_allele] = 0
+            self.n_reads[read_allele] = 0
+            self.n_strand[0][read_allele] = 0
+            self.n_strand[1][read_allele] = 0
+            self.n_tips[read_allele] = 0
+        self.bq_set_C[read_allele].append(read_bq)
+        self.n_reads_C[read_allele] += 1
+        self.n_reads[read_allele] += 1
+
+    @cython.ccall
+    def raw_read_depth(self,
+                       opt: str = "all") -> cython.int:
+        if opt == "all":
+            return sum(self.n_reads.values())
+        elif opt == "T":
+            return sum(self.n_reads_T.values())
+        elif opt == "C":
+            return sum(self.n_reads_C.values())
+        else:
+            raise Exception("opt should be either 'all', 'T' or 'C'.")
+
+    @cython.ccall
+    def update_top_alleles(self,
+                           min_top12alleles_ratio: cython.float = 0.8,
+                           min_altallele_count: cython.int = 2,
+                           max_allowed_ar: cython.float = 0.95):
+        """Identify the top1 and top2 alleles (NTs), and compute the
+        ratio of (top1+top2) read counts over the total.
+        """
+        [self.top1allele, self.top2allele] = sorted(self.n_reads,
+                                                    key=self.n_reads_T.get,
+                                                    reverse=True)[:2]
+
+        # if top2 allele count in ChIP is lower than
+        # min_altallele_count, or when allele ratio top1/(top1+top2)
+        # is larger than max_allowed_ar in ChIP, we won't consider
+        # this allele at all. We set values of top2 allele in
+        # dictionaries to [] and ignore top2 allele entirely.
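+        # Illustration (hypothetical counts): with the defaults
+        # (min_altallele_count=2, max_allowed_ar=0.95), a site with 40
+        # treatment reads supporting top1 and a single non-reference,
+        # non-tip read supporting top2 trips both tests below
+        # (1 < 2 and 40/41 > 0.95), so the top2 records are reset to
+        # empty/zero and only top1 is kept.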
+        # max(self.n_strand[0][self.top2allele], self.n_strand[1][self.top2allele]) < min_altallele_count
+        # if self.ref_pos == 52608504:
+        #     print(self.ref_pos,
+        #           self.n_reads_T[self.top1allele], self.n_reads_T[self.top2allele],
+        #           self.n_reads_C[self.top1allele], self.n_reads_C[self.top2allele])
+        if self.n_reads_T[self.top1allele] + self.n_reads_T[self.top2allele] == 0:
+            self.filterout = True
+            return
+
+        if ((len(self.top1allele) == 1 and len(self.top2allele) == 1) and
+            (self.top2allele != self.ref_allele and
+             ((self.n_reads_T[self.top2allele] - self.n_tips[self.top2allele]) < min_altallele_count) or
+             self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele] + self.n_reads_T[self.top2allele]) > max_allowed_ar)):
+            self.bq_set_T[self.top2allele] = []
+            self.bq_set_C[self.top2allele] = []
+            self.n_reads_T[self.top2allele] = 0
+            self.n_reads_C[self.top2allele] = 0
+            self.n_reads[self.top2allele] = 0
+            self.n_tips[self.top2allele] = 0
+        if (self.top1allele != self.ref_allele and (self.n_reads_T[self.top1allele] - self.n_tips[self.top1allele]) < min_altallele_count):
+            self.bq_set_T[self.top1allele] = []
+            self.bq_set_C[self.top1allele] = []
+            self.n_reads_T[self.top1allele] = 0
+            self.n_reads_C[self.top1allele] = 0
+            self.n_reads[self.top1allele] = 0
+            self.n_tips[self.top1allele] = 0
+
+        if self.n_reads_T[self.top1allele] + self.n_reads_T[self.top2allele] == 0:
+            self.filterout = True
+            return
+
+        self.top12alleles_ratio = (self.n_reads[self.top1allele] + self.n_reads[self.top2allele]) / sum(self.n_reads.values())
+        if self.top12alleles_ratio < min_top12alleles_ratio:
+            self.filterout = True
+            return
+
+        if self.top1allele == self.ref_allele and self.n_reads[self.top2allele] == 0:
+            # This position only contains the top1allele, which is the
+            # ref_allele, so the GT must be 0/0.
+            self.type = "homo_ref"
+            self.filterout = True
+            return
+        return
+
+    @cython.ccall
+    def top12alleles(self):
+        print(self.ref_pos, self.ref_allele)
+        print("Top1allele", self.top1allele, "Treatment",
+              self.bq_set_T[self.top1allele], "Control",
+              self.bq_set_C[self.top1allele])
+        print("Top2allele", self.top2allele, "Treatment",
+              self.bq_set_T[self.top2allele], "Control",
+              self.bq_set_C[self.top2allele])
+
+    @cython.ccall
+    def call_GT(self, max_allowed_ar: cython.float = 0.99):
+        """Requires that update_top_alleles has been called.
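+
+        Four models are compared by BIC: homozygous for the major
+        allele, homozygous for the minor allele, heterozygous without
+        allele-specific bias (heter_noAS), and heterozygous with
+        allele-specific bias (heter_AS). A genotype is accepted only
+        when its BIC beats every alternative by at least 2, and
+        deltaBIC records that margin. PL values follow the VCF
+        convention, PL = -10 * lnL / ln(10), shifted so the chosen
+        genotype has PL 0; GQ is the smallest PL among the other
+        genotypes.
+
+        A worked example with made-up log-likelihoods: if
+        lnL_homo_major = -5, the best heterozygous lnL = -15, and
+        lnL_homo_minor = -40, the raw PLs are about 21.7, 65.1 and
+        173.7; after shifting, PL_11 = 0, PL_01 ~ 43.4 and
+        PL_00 ~ 152.0, so GQ ~ 43.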
+        """
+        top1_bq_T: cnp.ndarray(cython.int, ndim=1)
+        top2_bq_T: cnp.ndarray(cython.int, ndim=1)
+        top1_bq_C: cnp.ndarray(cython.int, ndim=1)
+        top2_bq_C: cnp.ndarray(cython.int, ndim=1)
+        tmp_mutation_type: list
+        tmp_alt: cython.bytes
+
+        if self.filterout:
+            return
+
+        top1_bq_T = np.array(self.bq_set_T[self.top1allele], dtype="i4")
+        top2_bq_T = np.array(self.bq_set_T[self.top2allele], dtype="i4")
+        top1_bq_C = np.array(self.bq_set_C[self.top1allele], dtype="i4")
+        top2_bq_C = np.array(self.bq_set_C[self.top2allele], dtype="i4")
+        (self.lnL_homo_major, self.BIC_homo_major) = CalModel_Homo(top1_bq_T,
+                                                                   top1_bq_C,
+                                                                   top2_bq_T,
+                                                                   top2_bq_C)
+        (self.lnL_homo_minor, self.BIC_homo_minor) = CalModel_Homo(top2_bq_T,
+                                                                   top2_bq_C,
+                                                                   top1_bq_T,
+                                                                   top1_bq_C)
+        (self.lnL_heter_noAS, self.BIC_heter_noAS) = CalModel_Heter_noAS(top1_bq_T,
+                                                                         top1_bq_C,
+                                                                         top2_bq_T,
+                                                                         top2_bq_C)
+        (self.lnL_heter_AS, self.BIC_heter_AS) = CalModel_Heter_AS(top1_bq_T,
+                                                                   top1_bq_C,
+                                                                   top2_bq_T,
+                                                                   top2_bq_C,
+                                                                   max_allowed_ar)
+
+        # if self.ref_pos == 71078525:
+        #     print("---")
+        #     print(len(top1_bq_T), len(top1_bq_C), len(top2_bq_T), len(top2_bq_C))
+        #     print(self.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS)
+        #     print(self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS)
+
+        if self.top1allele != self.ref_allele and self.n_reads[self.top2allele] == 0:
+            # in this case, there is no top2 nt (or so-called minor
+            # allele) in either treatment or control, so we assume
+            # it's a 1/1 genotype. We will take 1/1 if it passes the
+            # BIC test (deltaBIC >= 2), and will skip this locus if it
+            # can't pass the test.
+
+            self.deltaBIC = min(self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor) - self.BIC_homo_major
+            if self.deltaBIC < 2:
+                self.filterout = True
+                return
+
+            self.type = "homo"
+            self.GT = "1/1"
+
+            self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
+            self.PL_01 = -10.0 * max(self.lnL_heter_noAS, self.lnL_heter_AS) / LN10
+            self.PL_11 = -10.0 * self.lnL_homo_major / LN10
+
+            self.PL_00 = max(0, self.PL_00 - self.PL_11)
+            self.PL_01 = max(0, self.PL_01 - self.PL_11)
+            self.PL_11 = 0
+
+            self.GQ = min(self.PL_00, self.PL_01)
+            self.alt_allele = self.top1allele
+        else:
+            # assign GQ, GT, and type
+            if self.ref_allele != self.top1allele and self.BIC_homo_major + 2 <= self.BIC_homo_minor and self.BIC_homo_major + 2 <= self.BIC_heter_noAS and self.BIC_homo_major + 2 <= self.BIC_heter_AS:
+                self.type = "homo"
+                self.deltaBIC = min(self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor) - self.BIC_homo_major
+                self.GT = "1/1"
+                self.alt_allele = self.top1allele
+
+                self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
+                self.PL_01 = -10.0 * max(self.lnL_heter_noAS, self.lnL_heter_AS) / LN10
+                self.PL_11 = -10.0 * self.lnL_homo_major / LN10
+
+                self.PL_00 = self.PL_00 - self.PL_11
+                self.PL_01 = self.PL_01 - self.PL_11
+                self.PL_11 = 0
+
+                self.GQ = min(self.PL_00, self.PL_01)
+
+            elif self.BIC_heter_noAS + 2 <= self.BIC_homo_major and self.BIC_heter_noAS + 2 <= self.BIC_homo_minor and self.BIC_heter_noAS + 2 <= self.BIC_heter_AS:
+                self.type = "heter_noAS"
+                self.deltaBIC = min(self.BIC_homo_major, self.BIC_homo_minor) - self.BIC_heter_noAS
+
+                self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
+                self.PL_01 = -10.0 * self.lnL_heter_noAS / LN10
+                self.PL_11 = -10.0 * self.lnL_homo_major / LN10
+
+                self.PL_00 = self.PL_00 - self.PL_01
+                self.PL_11 = self.PL_11 - self.PL_01
+                self.PL_01 = 0
+
+                self.GQ = min(self.PL_00, self.PL_11)
+
+            elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor and self.BIC_heter_AS + 2 <= self.BIC_heter_noAS:
+                self.type = "heter_AS"
+                self.deltaBIC = min(self.BIC_homo_major, self.BIC_homo_minor) - self.BIC_heter_AS
+
+                self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
+                self.PL_01 = -10.0 * self.lnL_heter_AS / LN10
+                self.PL_11 = -10.0 * self.lnL_homo_major / LN10
+
+                self.PL_00 = self.PL_00 - self.PL_01
+                self.PL_11 = self.PL_11 - self.PL_01
+                self.PL_01 = 0
+
+                self.GQ = min(self.PL_00, self.PL_11)
+
+            elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor:
+                # can't decide if it's noAS or AS
+                self.type = "heter_unsure"
+                self.deltaBIC = min(self.BIC_homo_major, self.BIC_homo_minor) - max(self.BIC_heter_AS, self.BIC_heter_noAS)
+
+                self.PL_00 = -10.0 * self.lnL_homo_minor / LN10
+                self.PL_01 = -10.0 * max(self.lnL_heter_noAS, self.lnL_heter_AS) / LN10
+                self.PL_11 = -10.0 * self.lnL_homo_major / LN10
+
+                self.PL_00 = self.PL_00 - self.PL_01
+                self.PL_11 = self.PL_11 - self.PL_01
+                self.PL_01 = 0
+
+                self.GQ = min(self.PL_00, self.PL_11)
+
+            elif self.ref_allele == self.top1allele and self.BIC_homo_major < self.BIC_homo_minor and self.BIC_homo_major < self.BIC_heter_noAS and self.BIC_homo_major < self.BIC_heter_AS:
+                self.type = "homo_ref"
+                # we do not calculate GQ if type is homo_ref
+                self.GT = "0/0"
+                self.filterout = True
+            else:
+                self.type = "unsure"
+                self.filterout = True
+
+            if self.type.startswith("heter"):
+                if self.ref_allele == self.top1allele:
+                    self.alt_allele = self.top2allele
+                    self.GT = "0/1"
+                elif self.ref_allele == self.top2allele:
+                    self.alt_allele = self.top1allele
+                    self.GT = "0/1"
+                else:
+                    self.alt_allele = self.top1allele+b','+self.top2allele
+                    self.GT = "1/2"
+
+        tmp_mutation_type = []
+        for tmp_alt in self.alt_allele.split(b','):
+            if tmp_alt == b'*':
+                tmp_mutation_type.append("Deletion")
+            elif len(tmp_alt) > 1:
+                tmp_mutation_type.append("Insertion")
+            else:
+                tmp_mutation_type.append("SNV")
+        self.mutation_type = ",".join(tmp_mutation_type)
+        return
+
+    @cython.cfunc
+    def SB_score_ChIP(self,
+                      a: cython.int,
+                      b: cython.int,
+                      c: cython.int,
+                      d: cython.int) -> cython.float:
+        """Calculate a score for filtering variants with strange strand biases.
+
+        a: top1/major allele plus strand
+        b: top2/minor allele plus strand
+        c: top1/major allele minus strand
+        d: top2/minor allele minus strand
+
+        Return a float value; if this value >= 1, the variant will be
+        filtered out.
+        """
+        p1_l: cython.double
+        p1_r: cython.double
+        p2_l: cython.double
+        p2_r: cython.double
+
+        if a + b == 0 or c + d == 0:
+            # if major allele and minor allele both bias to the same strand, allow it
+            return 0.0
+
+        # Rule:
+        # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
+        # or there is no significant bias (0.5) in top2 allele.
+
+        # print(a, b, c, d)
+        p1_l = binomial_cdf(a, (a+c), 0.5, lower=True)   # alternative: less than 0.5
+        p1_r = binomial_cdf(c, (a+c), 0.5, lower=True)   # greater than 0.5
+        p2_l = binomial_cdf(b, (b+d), 0.5, lower=True)   # alternative: less than 0.5
+        p2_r = binomial_cdf(d, (b+d), 0.5, lower=True)   # greater than 0.5
+        # print(p1_l, p1_r, p2_l, p2_r)
+
+        if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05):
+            # we reject loci where the significant biases are
+            # inconsistent between top1 and top2 alleles.
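+            # Example (hypothetical counts): a=0, c=20 (top1 entirely
+            # on the minus strand) with b=15, d=0 (top2 entirely on the
+            # plus strand) gives p1_l = binomial_cdf(0, 20, 0.5) ~ 1e-6
+            # and p2_r = binomial_cdf(0, 15, 0.5) ~ 3e-5, both < 0.05,
+            # so such a variant is rejected here.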
+            return 1.0
+        else:
+            # if b<=2 and d==0 or b==0 and d<=2 -- highly possible FPs
+            # if (b <= 2 and d == 0 or b == 0 and d <= 2):
+            #     return 1
+            # can't decide
+            return 0.0
+
+    @cython.cfunc
+    def SB_score_ATAC(self,
+                      a: cython.int,
+                      b: cython.int,
+                      c: cython.int,
+                      d: cython.int) -> cython.float:
+        """Calculate a score for filtering variants with strange strand biases.
+
+        ATAC-seq version
+
+        a: top1/major allele plus strand
+        b: top2/minor allele plus strand
+        c: top1/major allele minus strand
+        d: top2/minor allele minus strand
+
+        Return a float value; if this value >= 1, the variant will be
+        filtered out.
+        """
+        p1_l: cython.double
+        p1_r: cython.double
+        p2_l: cython.double
+        p2_r: cython.double
+
+        if a+b == 0 or c+d == 0:
+            # if major allele and minor allele both bias to the same strand, allow it
+            return 0.0
+
+        # Rule:
+        # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it.
+        # or there is no significant bias (0.5) in top2 allele.
+        # print(a, b, c, d)
+        p1_l = binomial_cdf(a, (a+c), 0.5, lower=True)   # alternative: less than 0.5
+        p1_r = binomial_cdf(c, (a+c), 0.5, lower=True)   # greater than 0.5
+        p2_l = binomial_cdf(b, (b+d), 0.5, lower=True)   # alternative: less than 0.5
+        p2_r = binomial_cdf(d, (b+d), 0.5, lower=True)   # greater than 0.5
+        # print(p1_l, p1_r, p2_l, p2_r)
+
+        if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05):
+            # we reject loci where the significant biases are
+            # inconsistent between top1 and top2 alleles.
+            return 1.0
+        else:
+            # can't decide
+            return 0.0
+
+    @cython.ccall
+    def to_vcf(self) -> str:
+        """Output REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLE columns.
+        """
+        vcf_ref: str
+        vcf_alt: str
+        vcf_qual: str
+        vcf_filter: str
+        vcf_info: str
+        vcf_format: str
+        vcf_sample: str
+
+        vcf_ref = self.ref_allele.decode()
+        vcf_alt = self.alt_allele.decode()
+        vcf_qual = "%d" % self.GQ
+        vcf_filter = "."
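+        # Key to the INFO fields written below: M=model/genotype type,
+        # MT=mutation type, DPT/DPC=total depth in treatment/control,
+        # DP1T/DP2T and DP1C/DP2C=depth of the top1/top2 allele (allele
+        # appended), SB=strand counts (top1+, top2+, top1-, top2-),
+        # DBIC=deltaBIC, BICHOMOMAJOR..BICHETERAS=per-model BIC,
+        # AR=top1 allele ratio in treatment.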
+ vcf_info = (b"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % + (self.type.encode(), + self.mutation_type.encode(), + sum(self.n_reads_T.values()), + sum(self.n_reads_C.values()), + self.n_reads_T[self.top1allele], + self.top1allele, + self.n_reads_T[self.top2allele], + self.top2allele, + self.n_reads_C[self.top1allele], + self.top1allele, + self.n_reads_C[self.top2allele], + self.top2allele, + self.n_strand[0][self.top1allele], + self.n_strand[0][self.top2allele], + self.n_strand[1][self.top1allele], + self.n_strand[1][self.top2allele], + self.deltaBIC, + self.BIC_homo_major, + self.BIC_homo_minor, + self.BIC_heter_noAS, + self.BIC_heter_AS, + self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]) + )).decode() + vcf_format = "GT:DP:GQ:PL" + vcf_sample = "%s:%d:%d:%d,%d,%d" % (self.GT, self.raw_read_depth(opt="all"), self.GQ, self.PL_00, self.PL_01, self.PL_11) + return "\t".join((vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample)) + + @cython.ccall + def toVariant(self): + v: Variant + + v = Variant(self.ref_allele.decode(), + self.alt_allele.decode(), + self.GQ, + '.', + self.type, + self.mutation_type, + self.top1allele.decode(), + self.top2allele.decode(), + sum(self.n_reads_T.values()), + sum(self.n_reads_C.values()), + self.n_reads_T[self.top1allele], + self.n_reads_T[self.top2allele], + self.n_reads_C[self.top1allele], + self.n_reads_C[self.top2allele], + self.n_strand[0][self.top1allele], + self.n_strand[0][self.top2allele], + self.n_strand[1][self.top1allele], + self.n_strand[1][self.top2allele], + self.deltaBIC, + self.BIC_homo_major, + self.BIC_homo_minor, + self.BIC_heter_noAS, + self.BIC_heter_AS, + self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]), + self.GT, + self.raw_read_depth(opt="all"), + self.PL_00, + self.PL_01, + self.PL_11) + return v diff --git a/MACS3/Signal/PosReadsInfo.pyx b/MACS3/Signal/PosReadsInfo.pyx deleted file mode 100644 index 93d38348..00000000 --- a/MACS3/Signal/PosReadsInfo.pyx +++ /dev/null @@ -1,600 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2020-12-04 23:10:35 Tao Liu> - -"""Module for SAPPER PosReadsInfo class. - -Copyright (c) 2017 Tao Liu - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). 
- -@status: experimental -@version: $Revision$ -@author: Tao Liu -@contact: tliu4@buffalo.edu -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from MACS3.Signal.VariantStat import CalModel_Homo, CalModel_Heter_noAS, CalModel_Heter_AS, calculate_GQ, calculate_GQ_heterASsig -from MACS3.Signal.Prob import binomial_cdf -from MACS3.Signal.PeakVariants import Variant - -from cpython cimport bool - -import numpy as np -cimport numpy as np -from numpy cimport uint32_t, uint64_t, int32_t, float32_t - -LN10 = 2.3025850929940458 - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - void *malloc(size_t size) - void *calloc(size_t n, size_t size) - void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "Parser $Revision$" -__author__ = "Tao Liu " -__doc__ = "All Parser classes" - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class PosReadsInfo: - cdef: - long ref_pos - bytes ref_allele - bytes alt_allele - bool filterout # if true, do not output - - dict bq_set_T #{A:[], C:[], G:[], T:[], N:[]} for treatment - dict bq_set_C - dict n_reads_T #{A:[], C:[], G:[], T:[], N:[]} for treatment - dict n_reads_C - dict n_reads - - list n_strand #[{A:[], C:[], G:[], T:[], N:[]},{A:[], C:[], G:[], T:[], N:[]}] for total appearance on plus strand and minus strand for ChIP sample only - dict n_tips # count of nt appearing at tips - - bytes top1allele - bytes top2allele - float top12alleles_ratio - - double lnL_homo_major,lnL_heter_AS,lnL_heter_noAS,lnL_homo_minor - double BIC_homo_major,BIC_heter_AS,BIC_heter_noAS,BIC_homo_minor - double PL_00, PL_01, PL_11 - double deltaBIC - int heter_noAS_kc, heter_noAS_ki - int heter_AS_kc, heter_AS_ki - double heter_AS_alleleratio - - int GQ_homo_major,GQ_heter_noAS,GQ_heter_AS #phred scale of prob by standard formular - int GQ_heter_ASsig #phred scale of prob, to measure the difference between AS and noAS - - double GQ - - str GT - str type - str mutation_type # SNV or Insertion or Deletion - - bool hasfermiinfor #if no fermi bam overlap in the position, false; if fermi bam in the position GT: N, false; if anyone of top2allele is not in fermi GT NTs, false; - bytearray fermiNTs # - - def __cinit__ ( self ): - self.filterout = False - self.GQ = 0 - self.GT = "unsure" - self.alt_allele = b'.' - - def __init__ ( self, long ref_pos, bytes ref_allele ): - self.ref_pos = ref_pos - self.ref_allele = ref_allele - self.bq_set_T = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] } - self.bq_set_C = { ref_allele:[],b'A':[], b'C':[], b'G':[], b'T':[], b'N':[], b'*':[] } - self.n_reads_T = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - self.n_reads_C = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - self.n_reads = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - self.n_strand = [ { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 }, { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } ] - self.n_tips = { ref_allele:0,b'A':0, b'C':0, b'G':0, b'T':0, b'N':0, b'*':0 } - - - #cpdef void merge ( self, PosReadsInfo PRI2 ): - # """Merge two PRIs. No check available. 
- # - # """ - # assert self.ref_pos == PRI2.ref_pos - # assert self.ref_allele == PRI2.ref_allele - # for b in set( self.n_reads.keys() ).union( set( PRI2.n_reads.keys() ) ): - # self.bq_set_T[ b ] = self.bq_set_T.get( b, []).extend( PRI2.bq_set_T.get( b, [] ) ) - # self.bq_set_C[ b ] = self.bq_set_C.get( b, []).extend( PRI2.bq_set_C.get( b, [] ) ) - # self.n_reads_T[ b ] = self.n_reads_T.get( b, 0) + PRI2.n_reads_T.get( b, 0 ) - # self.n_reads_C[ b ] = self.n_reads_C.get( b, 0) + PRI2.n_reads_C.get( b, 0 ) - # self.n_reads[ b ] = self.n_reads.get( b, 0) + PRI2.n_reads.get( b, 0 ) - # return - - def __getstate__ ( self ): - return ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout, - self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips, - self.top1allele, self.top2allele, self.top12alleles_ratio, - self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor, - self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor, - self.heter_noAS_kc, self.heter_noAS_ki, - self.heter_AS_kc, self.heter_AS_ki, - self.heter_AS_alleleratio, - self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS, - self.GQ_heter_ASsig, - self.GQ, - self.GT, - self.type, - self.hasfermiinfor, - self.fermiNTs ) - - def __setstate__ ( self, state ): - ( self.ref_pos, self.ref_allele, self.alt_allele, self.filterout, - self.bq_set_T, self.bq_set_C, self.n_reads_T, self.n_reads_C, self.n_reads, self.n_strand, self.n_tips, - self.top1allele, self.top2allele, self.top12alleles_ratio, - self.lnL_homo_major, self.lnL_heter_AS, self.lnL_heter_noAS, self.lnL_homo_minor, - self.BIC_homo_major, self.BIC_heter_AS, self.BIC_heter_noAS, self.BIC_homo_minor, - self.heter_noAS_kc, self.heter_noAS_ki, - self.heter_AS_kc, self.heter_AS_ki, - self.heter_AS_alleleratio, - self.GQ_homo_major, self.GQ_heter_noAS, self.GQ_heter_AS, - self.GQ_heter_ASsig, - self.GQ, - self.GT, - self.type, - self.hasfermiinfor, - self.fermiNTs ) = state - - cpdef bool filterflag ( self ): - return self.filterout - - cpdef void apply_GQ_cutoff ( self, int min_homo_GQ = 50, int min_heter_GQ = 100 ): - if self.filterout: - return - if self.type.startswith('homo') and self.GQ < min_homo_GQ: - self.filterout = True - elif self.type.startswith('heter') and self.GQ < min_heter_GQ: - self.filterout = True - return - - cpdef void apply_deltaBIC_cutoff ( self, float min_delta_BIC = 10 ): - if self.filterout: - return - if self.deltaBIC < min_delta_BIC: - self.filterout = True - return - - cpdef void add_T ( self, int read_index, bytes read_allele, int read_bq, int strand, bool tip, int Q=20 ): - """ Strand 0: plus, 1: minus - - Q is the quality cutoff. By default, only consider Q20 or read_bq > 20. 
- """ - if read_bq <= Q: - return - if not self.n_reads.has_key( read_allele ): - self.bq_set_T[read_allele] = [] - self.bq_set_C[read_allele] = [] - self.n_reads_T[read_allele] = 0 - self.n_reads_C[read_allele] = 0 - self.n_reads[read_allele] = 0 - self.n_strand[ 0 ][ read_allele ] = 0 - self.n_strand[ 1 ][ read_allele ] = 0 - self.n_tips[read_allele] = 0 - self.bq_set_T[read_allele].append( read_bq ) - self.n_reads_T[ read_allele ] += 1 - self.n_reads[ read_allele ] += 1 - self.n_strand[ strand ][ read_allele ] += 1 - if tip: self.n_tips[ read_allele ] += 1 - - cpdef void add_C ( self, int read_index, bytes read_allele, int read_bq, int strand, int Q=20 ): - if read_bq <= Q: - return - if not self.n_reads.has_key( read_allele ): - self.bq_set_T[read_allele] = [] - self.bq_set_C[read_allele] = [] - self.n_reads_T[read_allele] = 0 - self.n_reads_C[read_allele] = 0 - self.n_reads[read_allele] = 0 - self.n_strand[ 0 ][ read_allele ] = 0 - self.n_strand[ 1 ][ read_allele ] = 0 - self.n_tips[read_allele] = 0 - self.bq_set_C[read_allele].append( read_bq ) - self.n_reads_C[ read_allele ] += 1 - self.n_reads[ read_allele ] += 1 - #self.n_strand[ strand ][ read_allele ] += 1 - - cpdef int raw_read_depth ( self, str opt = "all" ): - if opt == "all": - return sum( self.n_reads.values() ) - elif opt == "T": - return sum( self.n_reads_T.values() ) - elif opt == "C": - return sum( self.n_reads_C.values() ) - else: - raise Exception( "opt should be either 'all', 'T' or 'C'." ) - - cpdef void update_top_alleles ( self, float min_top12alleles_ratio = 0.8, int min_altallele_count = 2, float max_allowed_ar = 0.95 ): - #cpdef update_top_alleles ( self, float min_top12alleles_ratio = 0.8 ): - """Identify top1 and top2 NT. the ratio of (top1+top2)/total - """ - cdef: - float r - - [self.top1allele, self.top2allele] = sorted(self.n_reads, key=self.n_reads_T.get, reverse=True)[:2] - - # if top2 allele count in ChIP is lower than - # min_altallele_count, or when allele ratio top1/(top1+top2) - # is larger than max_allowed_ar in ChIP, we won't consider - # this allele at all. we set values of top2 allele in - # dictionaries to [] and ignore top2 allele entirely. 
- - # max(self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top2allele ]) < min_altallele_count - #if self.ref_pos == 52608504: - # print self.ref_pos, self.n_reads_T[ self.top1allele ], self.n_reads_T[ self.top2allele ], self.n_reads_C[ self.top1allele ], self.n_reads_C[ self.top2allele ] - if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0: - self.filterout = True - return - - if (len(self.top1allele)==1 and len(self.top2allele)==1) and ( self.top2allele != self.ref_allele and ( ( self.n_reads_T[ self.top2allele ] - self.n_tips[ self.top2allele ] ) < min_altallele_count ) or \ - self.n_reads_T[ self.top1allele ]/(self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ]) > max_allowed_ar ): - self.bq_set_T[ self.top2allele ] = [] - self.bq_set_C[ self.top2allele ] = [] - self.n_reads_T[ self.top2allele ] = 0 - self.n_reads_C[ self.top2allele ] = 0 - self.n_reads[ self.top2allele ] = 0 - self.n_tips[ self.top2allele ] = 0 - if ( self.top1allele != self.ref_allele and ( self.n_reads_T[ self.top1allele ] - self.n_tips[ self.top1allele ] ) < min_altallele_count ): - self.bq_set_T[ self.top1allele ] = [] - self.bq_set_C[ self.top1allele ] = [] - self.n_reads_T[ self.top1allele ] = 0 - self.n_reads_C[ self.top1allele ] = 0 - self.n_reads[ self.top1allele ] = 0 - self.n_tips[ self.top1allele ] = 0 - - if self.n_reads_T[ self.top1allele ] + self.n_reads_T[ self.top2allele ] == 0: - self.filterout = True - return - - self.top12alleles_ratio = ( self.n_reads[ self.top1allele ] + self.n_reads[ self.top2allele ] ) / sum( self.n_reads.values() ) - if self.top12alleles_ratio < min_top12alleles_ratio: - self.filterout = True - return - - if self.top1allele == self.ref_allele and self.n_reads[ self.top2allele ] == 0: - # This means this position only contains top1allele which is the ref_allele. So the GT must be 0/0 - self.type = "homo_ref" - self.filterout = True - return - return - - cpdef void top12alleles ( self ): - print ( self.ref_pos, self.ref_allele) - print ("Top1allele",self.top1allele, "Treatment", self.bq_set_T[self.top1allele], "Control", self.bq_set_C[self.top1allele]) - print ("Top2allele",self.top2allele, "Treatment", self.bq_set_T[self.top2allele], "Control", self.bq_set_C[self.top2allele]) - - cpdef void call_GT ( self, float max_allowed_ar = 0.99 ): - """Require update_top_alleles being called. 
- """ - cdef: - np.ndarray[np.int32_t, ndim=1] top1_bq_T - np.ndarray[np.int32_t, ndim=1] top2_bq_T - np.ndarray[np.int32_t, ndim=1] top1_bq_C - np.ndarray[np.int32_t, ndim=1] top2_bq_C - int i - list top1_bq_T_l - list top2_bq_T_l - list top1_bq_C_l - list top2_bq_C_l - list tmp_mutation_type - bytes tmp_alt - - if self.filterout: - return - - top1_bq_T = np.array( self.bq_set_T[ self.top1allele ], dtype="int32" ) - top2_bq_T = np.array( self.bq_set_T[ self.top2allele ], dtype="int32" ) - top1_bq_C = np.array( self.bq_set_C[ self.top1allele ], dtype="int32" ) - top2_bq_C = np.array( self.bq_set_C[ self.top2allele ], dtype="int32" ) - (self.lnL_homo_major, self.BIC_homo_major) = CalModel_Homo( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C ) - (self.lnL_homo_minor, self.BIC_homo_minor) = CalModel_Homo( top2_bq_T, top2_bq_C, top1_bq_T, top1_bq_C ) - (self.lnL_heter_noAS, self.BIC_heter_noAS) = CalModel_Heter_noAS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C ) - (self.lnL_heter_AS, self.BIC_heter_AS) = CalModel_Heter_AS( top1_bq_T, top1_bq_C, top2_bq_T, top2_bq_C, max_allowed_ar ) - - #if self.ref_pos == 71078525: - # print "---" - # print len( top1_bq_T ), len( top1_bq_C ), len( top2_bq_T ), len( top2_bq_C ) - # print self.lnL_homo_major, self.lnL_homo_minor, self.lnL_heter_noAS, self.lnL_heter_AS - # print self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS, self.BIC_heter_AS - - if self.top1allele != self.ref_allele and self.n_reads[ self.top2allele ] == 0: - # in this case, there is no top2 nt (or socalled minor - # allele) in either treatment or control, we should assume - # it's a 1/1 genotype. We will take 1/1 if it passes BIC - # test (deltaBIC >=2), and will skip this loci if it can't - # pass the test. - - self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major - if self.deltaBIC < 2: - self.filterout = True - return - - self.type = "homo" - self.GT = "1/1" - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = max( 0, self.PL_00 - self.PL_11 ) - self.PL_01 = max( 0, self.PL_01 - self.PL_11 ) - self.PL_11 = 0 - - self.GQ = min( self.PL_00, self.PL_01 ) - self.alt_allele = self.top1allele - else: - # assign GQ, GT, and type - if self.ref_allele != self.top1allele and self.BIC_homo_major + 2 <= self.BIC_homo_minor and self.BIC_homo_major + 2 <= self.BIC_heter_noAS and self.BIC_homo_major + 2 <= self.BIC_heter_AS: - self.type = "homo" - self.deltaBIC = min( self.BIC_heter_noAS, self.BIC_heter_AS, self.BIC_homo_minor ) - self.BIC_homo_major - self.GT = "1/1" - self.alt_allele = self.top1allele - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_11 - self.PL_01 = self.PL_01 - self.PL_11 - self.PL_11 = 0 - - self.GQ = min( self.PL_00, self.PL_01 ) - - elif self.BIC_heter_noAS + 2 <= self.BIC_homo_major and self.BIC_heter_noAS + 2 <= self.BIC_homo_minor and self.BIC_heter_noAS + 2 <= self.BIC_heter_AS : - self.type = "heter_noAS" - self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_noAS - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * self.lnL_heter_noAS / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_01 - self.PL_11 = self.PL_11 - 
self.PL_01 - self.PL_01 = 0 - - self.GQ = min( self.PL_00, self.PL_11 ) - - elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor and self.BIC_heter_AS + 2 <= self.BIC_heter_noAS: - self.type = "heter_AS" - self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - self.BIC_heter_AS - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * self.lnL_heter_AS / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_01 - self.PL_11 = self.PL_11 - self.PL_01 - self.PL_01 = 0 - - self.GQ = min( self.PL_00, self.PL_11 ) - - elif self.BIC_heter_AS + 2 <= self.BIC_homo_major and self.BIC_heter_AS + 2 <= self.BIC_homo_minor: - # can't decide if it's noAS or AS - self.type = "heter_unsure" - self.deltaBIC = min( self.BIC_homo_major, self.BIC_homo_minor ) - max( self.BIC_heter_AS, self.BIC_heter_noAS ) - - self.PL_00 = -10.0 * self.lnL_homo_minor / LN10 - self.PL_01 = -10.0 * max( self.lnL_heter_noAS, self.lnL_heter_AS ) / LN10 - self.PL_11 = -10.0 * self.lnL_homo_major / LN10 - - self.PL_00 = self.PL_00 - self.PL_01 - self.PL_11 = self.PL_11 - self.PL_01 - self.PL_01 = 0 - - self.GQ = min( self.PL_00, self.PL_11 ) - - elif self.ref_allele == self.top1allele and self.BIC_homo_major < self.BIC_homo_minor and self.BIC_homo_major < self.BIC_heter_noAS and self.BIC_homo_major < self.BIC_heter_AS: - self.type = "homo_ref" - # we do not calculate GQ if type is homo_ref - self.GT = "0/0" - self.filterout = True - else: - self.type="unsure" - self.filterout = True - - if self.type.startswith( "heter" ): - if self.ref_allele == self.top1allele: - self.alt_allele = self.top2allele - self.GT = "0/1" - elif self.ref_allele == self.top2allele: - self.alt_allele = self.top1allele - self.GT = "0/1" - else: - self.alt_allele = self.top1allele+b','+self.top2allele - self.GT = "1/2" - # strand bias filter, uncomment following if wish to debug - # calculate SB score - #print "calculate SB score for ", self.ref_pos, "a/b/c/d:", self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] - #SBscore = self.SB_score_ChIP( self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ] ) - #SBscore = 0 - #if SBscore >= 1: - # print "disgard variant at", self.ref_pos, "type", self.type - # self.filterout = True - - # if self.ref_allele == self.top1allele: - # self.n_strand[ 0 ][ self.top1allele ] + self.n_strand[ 1 ][ self.top1allele ] - # if and self.n_strand[ 0 ][ self.top2allele ] == 0 or self.n_strand[ 1 ][ self.top2allele ] == 0: - # self.filterout = True - # print self.ref_pos - - - # self.deltaBIC = self.deltaBIC - - tmp_mutation_type = [] - for tmp_alt in self.alt_allele.split(b','): - if tmp_alt == b'*': - tmp_mutation_type.append( "Deletion" ) - elif len( tmp_alt ) > 1: - tmp_mutation_type.append( "Insertion" ) - else: - tmp_mutation_type.append( "SNV" ) - self.mutation_type = ",".join( tmp_mutation_type ) - return - - cdef float SB_score_ChIP( self, int a, int b, int c, int d ): - """ calculate score for filtering variants with strange strand biases. - - a: top1/major allele plus strand - b: top2/minor allele plus strand - c: top1/major allele minus strand - d: top2/minor allele minus strand - - Return a float value so that if this value >= 1, the variant will be filtered out. 
- """ - cdef: - float score - double p - double p1_l, p1_r - double p2_l, p2_r - double top2_sb, top1_sb - - if a+b == 0 or c+d == 0: - # if major allele and minor allele both bias to the same strand, allow it - return 0.0 - - # Rule: - # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it. - # or there is no significant bias (0.5) in top2 allele. - - #print a, b, c, d - p1_l = binomial_cdf( a, (a+c), 0.5, lower=True ) # alternative: less than 0.5 - p1_r = binomial_cdf( c, (a+c), 0.5, lower=True ) # greater than 0.5 - p2_l = binomial_cdf( b, (b+d), 0.5, lower=True ) # alternative: less than 0.5 - p2_r = binomial_cdf( d, (b+d), 0.5, lower=True ) # greater than 0.5 - #print p1_l, p1_r, p2_l, p2_r - - if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05): - # we reject loci where the significant biases are inconsistent between top1 and top2 alleles. - return 1.0 - else: - # if b<=2 and d=0 or b=0 and d<=2 -- highly possible FPs - #if ( b<=2 and d==0 or b==0 and d<=2 ): - # return 1 - # can't decide - return 0.0 - - cdef float SB_score_ATAC( self, int a, int b, int c, int d ): - """ calculate score for filtering variants with strange strand biases. - - ATAC-seq version - - a: top1/major allele plus strand - b: top2/minor allele plus strand - c: top1/major allele minus strand - d: top2/minor allele minus strand - - Return a float value so that if this value >= 1, the variant will be filtered out. - """ - cdef: - float score - double p - double p1_l, p1_r - double p2_l, p2_r - double top2_sb, top1_sb - - if a+b == 0 or c+d == 0: - # if major allele and minor allele both bias to the same strand, allow it - return 0.0 - - # Rule: - # if there is bias in top2 allele then bias in top1 allele should not be significantly smaller than it. - # or there is no significant bias (0.5) in top2 allele. - - #print a, b, c, d - p1_l = binomial_cdf( a, (a+c), 0.5, lower=True ) # alternative: less than 0.5 - p1_r = binomial_cdf( c, (a+c), 0.5, lower=True ) # greater than 0.5 - p2_l = binomial_cdf( b, (b+d), 0.5, lower=True ) # alternative: less than 0.5 - p2_r = binomial_cdf( d, (b+d), 0.5, lower=True ) # greater than 0.5 - #print p1_l, p1_r, p2_l, p2_r - - if (p1_l < 0.05 and p2_r < 0.05) or (p1_r < 0.05 and p2_l < 0.05): - # we reject loci where the significant biases are inconsistent between top1 and top2 alleles. - return 1.0 - else: - # can't decide - return 0.0 - - cpdef str to_vcf ( self ): - """Output REF,ALT,QUAL,FILTER,INFO,FORMAT, SAMPLE columns. - """ - cdef: - str vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample - - vcf_ref = self.ref_allele.decode() - vcf_alt = self.alt_allele.decode() - vcf_qual = "%d" % self.GQ - vcf_filter = "." 
- vcf_info = (b"M=%s;MT=%s;DPT=%d;DPC=%d;DP1T=%d%s;DP2T=%d%s;DP1C=%d%s;DP2C=%d%s;SB=%d,%d,%d,%d;DBIC=%.2f;BICHOMOMAJOR=%.2f;BICHOMOMINOR=%.2f;BICHETERNOAS=%.2f;BICHETERAS=%.2f;AR=%.2f" % \ - (self.type.encode(), self.mutation_type.encode(), sum( self.n_reads_T.values() ), sum( self.n_reads_C.values() ), - self.n_reads_T[self.top1allele], self.top1allele, self.n_reads_T[self.top2allele], self.top2allele, - self.n_reads_C[self.top1allele], self.top1allele, self.n_reads_C[self.top2allele], self.top2allele, - self.n_strand[ 0 ][ self.top1allele ], self.n_strand[ 0 ][ self.top2allele ], self.n_strand[ 1 ][ self.top1allele ], self.n_strand[ 1 ][ self.top2allele ], - self.deltaBIC, - self.BIC_homo_major, self.BIC_homo_minor, self.BIC_heter_noAS,self.BIC_heter_AS, - self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]) - )).decode() - vcf_format = "GT:DP:GQ:PL" - vcf_sample = "%s:%d:%d:%d,%d,%d" % (self.GT, self.raw_read_depth( opt = "all" ), self.GQ, self.PL_00, self.PL_01, self.PL_11) - return "\t".join( ( vcf_ref, vcf_alt, vcf_qual, vcf_filter, vcf_info, vcf_format, vcf_sample ) ) - - cpdef toVariant ( self ): - cdef: - object v - v = Variant( - self.ref_allele.decode(), - self.alt_allele.decode(), - self.GQ, - '.', - self.type, - self.mutation_type, - self.top1allele.decode(), - self.top2allele.decode(), - sum( self.n_reads_T.values() ), - sum( self.n_reads_C.values() ), - self.n_reads_T[self.top1allele], - self.n_reads_T[self.top2allele], - self.n_reads_C[self.top1allele], - self.n_reads_C[self.top2allele], - self.n_strand[ 0 ][ self.top1allele ], - self.n_strand[ 0 ][ self.top2allele ], - self.n_strand[ 1 ][ self.top1allele ], - self.n_strand[ 1 ][ self.top2allele ], - self.deltaBIC, - self.BIC_homo_major, - self.BIC_homo_minor, - self.BIC_heter_noAS, - self.BIC_heter_AS, - self.n_reads_T[self.top1allele]/(self.n_reads_T[self.top1allele]+self.n_reads_T[self.top2allele]), - self.GT, - self.raw_read_depth( opt = "all" ), - self.PL_00, - self.PL_01, - self.PL_11 ) - return v diff --git a/MACS3/Signal/RACollection.pxd b/MACS3/Signal/RACollection.pxd new file mode 100644 index 00000000..ce14c068 --- /dev/null +++ b/MACS3/Signal/RACollection.pxd @@ -0,0 +1,61 @@ +cdef extern from "fml.h": + ctypedef struct bseq1_t: + int l_seq + char *seq + char *qual # NULL-terminated strings; length expected to match $l_seq + + ctypedef struct magopt_t: + int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth + float min_dratio1, max_bcov, max_bfrac + + ctypedef struct fml_opt_t: + int n_threads # number of threads; don't use multi-threading for small data sets + int ec_k # k-mer length for error correction; 0 for auto estimate + int min_cnt, max_cnt # both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] + int min_asm_ovlp # min overlap length during assembly + int min_merge_len # during assembly, don't explicitly merge an overlap if shorter than this value + magopt_t mag_opt # graph cleaning options + + ctypedef struct fml_ovlp_t: + unsigned int len_, from_, id_, to_ + #unit32_t from # $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end + #unsigned int id + #unsigned int to # $id: unitig number + + ctypedef struct fml_utg_t: + int len # length of sequence + int nsr # number of supporting reads + char *seq # unitig sequence + char *cov # cov[i]-33 gives per-base coverage at i + int n_ovlp[2] # number of 5'-end [0] and 3'-end [1] overlaps + fml_ovlp_t *ovlp # 
overlaps, of size n_ovlp[0]+n_ovlp[1] + + void fml_opt_init(fml_opt_t *opt) + fml_utg_t* fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg) + void fml_utg_destroy(int n_utg, fml_utg_t *utg) + void fml_utg_print(int n_utgs, const fml_utg_t *utg) + bseq1_t *bseq_read(const char *fn, int *n) + +# --- end of fermi-lite functions --- + +# --- smith-waterman alignment functions --- + +cdef extern from "swalign.h": + ctypedef struct seq_pair_t: + char *a + unsigned int alen + char *b + unsigned int blen + ctypedef struct align_t: + seq_pair_t *seqs + char *markup; + int start_a + int start_b + int end_a + int end_b + int matches + int gaps + double score + align_t *smith_waterman(seq_pair_t *problem) + void destroy_seq_pair(seq_pair_t *pair) + void destroy_align(align_t *ali) diff --git a/MACS3/Signal/RACollection.py b/MACS3/Signal/RACollection.py new file mode 100644 index 00000000..6f8ca04b --- /dev/null +++ b/MACS3/Signal/RACollection.py @@ -0,0 +1,906 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 16:26:57 Tao Liu> + +"""Module for ReadAlignment collection + +Copyright (c) 2024 Tao Liu + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). + +@status: experimental +@version: $Revision$ +@author: Tao Liu +@contact: vladimir.liu@gmail.com +""" +# ------------------------------------ +# python modules +# ------------------------------------ +from collections import Counter +from operator import itemgetter +from copy import copy + +from MACS3.Signal.ReadAlignment import ReadAlignment +from MACS3.Signal.PosReadsInfo import PosReadsInfo +from MACS3.Signal.UnitigRACollection import UnitigRAs, UnitigCollection +from MACS3.IO.PeakIO import PeakIO + +import cython +from cython.cimports.cpython import bool +# from cython.cimports.cpython.mem import PyMem_Malloc, PyMem_Free + +from cython.cimports.libc.stdlib import malloc, free + +# ------------------------------------ +# constants +# ------------------------------------ +__version__ = "Parser $Revision$" +__author__ = "Tao Liu " +__doc__ = "All Parser classes" + +__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. + +__CIGARCODE__ = "MIDNSHP=X" + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class RACollection: + """A collection of ReadAlignment objects and the corresponding + PeakIO. 
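For readers unfamiliar with Cython's pure-Python mode, which this PR adopts throughout: the new RACollection.pxd above *augments* the same-named RACollection.py, so the C names it declares (fml_opt_t, fml_assemble, smith_waterman, ...) resolve at compile time inside the .py file, while C utilities like malloc/free come in through the cython.cimports shim. A minimal sketch of the idioms the module relies on (cython.cast, cython.pointer, cython.sizeof); note it must be compiled with Cython, it will not run as plain Python:

    import cython
    from cython.cimports.libc.stdlib import malloc, free

    @cython.cfunc
    def fill_and_sum(n: cython.int) -> cython.long:
        # a raw C buffer, managed the same way fermi_assemble manages bseq1_t
        buf: cython.pointer(cython.int) = cython.cast(
            cython.pointer(cython.int), malloc(n * cython.sizeof(cython.int)))
        total: cython.long = 0
        i: cython.int
        for i in range(n):
            buf[i] = i
            total += buf[i]
        free(buf)
        return total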
+ + """ + chrom: bytes + peak: PeakIO # A PeakIO object + RAlists: list # contain ReadAlignment lists for treatment (0) and control (1) + left: cython.long # left position of peak + right: cython.long # right position of peak + length: cython.long # length of peak + RAs_left: cython.long # left position of all RAs in the collection + RAs_right: cython.long # right position of all RAs in the collection + sorted: bool # if sorted by lpos + peak_refseq: bytes # reference sequence in peak region b/w left and right + peak_refseq_ext: bytes # reference sequence in peak region with extension on both sides b/w RAs_left and RAs_right + + def __init__(self, chrom: bytes, peak: PeakIO, RAlist_T: list, RAlist_C: list = []): + """Create RACollection by: object taking: + + 1. peak: a PeakIO indicating: object the peak region. + + 2. RAlist: a python of: list ReadAlignment objects containing + all the reads overlapping the peak region. If no RAlist_C + given, it will be []. + + """ + if len(RAlist_T) == 0: + # no reads, return None + raise Exception("No reads from ChIP sample to construct RAcollection!") + self.chrom = chrom + self.peak = peak + # print(len(RAlist_T),"\n") + # print(len(RAlist_C),"\n") + self.RAlists = [RAlist_T, RAlist_C] + self.left = peak["start"] + self.right = peak["end"] + self.length = self.right - self.left + if RAlist_T: + self.RAs_left = RAlist_T[0]["lpos"] # initial assignment of RAs_left + self.RAs_right = RAlist_T[-1]["rpos"] # initial assignment of RAs_right + self.sort() # it will set self.sorted = True + else: + self.RAs_left = -1 + self.RAs_right = -1 + # check RAs_left and RAs_right + for ra in RAlist_T: + if ra["lpos"] < self.RAs_left: + self.RAs_left = ra["lpos"] + if ra["rpos"] > self.RAs_right: + self.RAs_right = ra["rpos"] + + for ra in RAlist_C: + if ra["lpos"] < self.RAs_left: + self.RAs_left = ra["lpos"] + if ra["rpos"] > self.RAs_right: + self.RAs_right = ra["rpos"] + (self.peak_refseq, self.peak_refseq_ext) = self.__get_peak_REFSEQ() + + def __getitem__(self, keyname): + if keyname == "chrom": + return self.chrom + elif keyname == "left": + return self.left + elif keyname == "right": + return self.right + elif keyname == "RAs_left": + return self.RAs_left + elif keyname == "RAs_right": + return self.RAs_right + elif keyname == "length": + return self.length + elif keyname == "count": + return len(self.RAlists[0]) + len(self.RAlists[1]) + elif keyname == "count_T": + return len(self.RAlists[0]) + elif keyname == "count_C": + return len(self.RAlists[1]) + elif keyname == "peak_refseq": + return self.peak_refseq + elif keyname == "peak_refseq_ext": + return self.peak_refseq_ext + else: + raise KeyError("Unavailable key:", keyname) + + def __getstate__(self): + #return {"chrom":self.chrom, "peak":self.peak, "RAlists":self.RAlists, + # "left":self.left, "right":self.right, "length": self.length, + # "RAs_left":self.RAs_left, "RAs_right":self.RAs_right} + return (self.chrom, self.peak, self.RAlists, self.left, self.right, + self.length, self.RAs_left, self.RAs_right, self.peak_refseq, + self.peak_refseq_ext) + + def __setstate__(self, state): + (self.chrom, self.peak, self.RAlists, self.left, self.right, + self.length, self.RAs_left, self.RAs_right, self.peak_refseq, + self.peak_refseq_ext) = state + + @cython.ccall + def sort(self): + """Sort RAs according to lpos. Should be used after realignment. 
+ + """ + if self.RAlists[0]: + self.RAlists[0].sort(key=itemgetter("lpos")) + if self.RAlists[1]: + self.RAlists[1].sort(key=itemgetter("lpos")) + self.sorted = True + return + + @cython.ccall + def remove_outliers(self, percent: cython.int = 5): + """ Remove outliers with too many n_edits. The outliers with + n_edits in top p% will be removed. + + Default: remove top 5% of reads that have too many differences + with reference genome. + """ + n_edits_list: list + ralist: list + read: ReadAlignment # ReadAlignment object + highest_n_edits: cython.int + new_RAlist: list + i: cython.int + + n_edits_list = [] + for ralist in self.RAlists: + for read in ralist: + n_edits_list.append(read["n_edits"]) + n_edits_list.sort() + highest_n_edits = n_edits_list[int(len(n_edits_list) * (1 - percent * .01))] + + for i in (range(len(self.RAlists))): + new_RAlist = [] + for read in self.RAlists[i]: + if read["n_edits"] <= highest_n_edits: + new_RAlist.append(read) + self.RAlists[i] = new_RAlist + + return + + @cython.ccall + def n_edits_sum(self) -> cython.int: + """ + """ + n_edits_list: list + ralist: list + read: ReadAlignment + c: cython.int + # highest_n_edits: cython.int + + n_edits_list = [] + + for ralist in self.RAlists: + for read in ralist: + n_edits_list.append(read["n_edits"]) + + n_edits_list.sort() + # print (n_edits_list) + c = Counter(n_edits_list) + return c + # print(c) + + @cython.cfunc + def __get_peak_REFSEQ(self) -> tuple: + """Get the reference sequence within the peak region. + + """ + peak_refseq: bytearray + # i: cython.int + # prev_r: cython.long #remember the previous filled right end + start: cython.long + end: cython.long + # ind: cython.long + # ind_r: cython.long + # read: ReadAlignment + # read_refseq_ext: bytearray + # read_refseq: bytearray + + start = min(self.RAs_left, self.left) + end = max(self.RAs_right, self.right) + # print ("left",start,"right",end) + peak_refseq_ext = bytearray(b'N' * (end - start)) + + # for treatment. + peak_refseq_ext = self.__fill_refseq(peak_refseq_ext, + self.RAlists[0]) + # and control if available. + if self.RAlists[1]: + peak_refseq_ext = self.__fill_refseq(peak_refseq_ext, + self.RAlists[1]) + + # trim + peak_refseq = peak_refseq_ext[self.left - start: self.right - start] + return (bytes(peak_refseq), bytes(peak_refseq_ext)) + + @cython.cfunc + def __fill_refseq(self, + seq: bytearray, + ralist: list) -> bytearray: + """Fill refseq sequence of whole peak with refseq sequences of + each read in ralist. + + """ + prev_r: cython.long # previous right position of last + # filled + ind: cython.long + ind_r: cython.long + start: cython.long + # end: cython.long + read: ReadAlignment + read_refseq: bytearray + + start = min(self.RAs_left, self.left) + + # print(len(ralist),"\n") + prev_r = ralist[0]["lpos"] + + for i in range(len(ralist)): + read = ralist[i] + if read["lpos"] > prev_r: + read = ralist[i - 1] + read_refseq = read.get_REFSEQ() + ind = read["lpos"] - start + ind_r = ind + read["rpos"] - read["lpos"] + seq[ind: ind_r] = read_refseq + prev_r = read["rpos"] + # last + read = ralist[-1] + read_refseq = read.get_REFSEQ() + ind = read["lpos"] - start + ind_r = ind + read["rpos"] - read["lpos"] + seq[ind: ind_r] = read_refseq + return seq + + @cython.ccall + def get_PosReadsInfo_ref_pos(self, + ref_pos: cython.long, + ref_nt: bytes, + Q: cython.int = 20): + """Generate a PosReadsInfo for: object a given reference genome + position. + + Return a PosReadsInfo object. 
+ + """ + s: bytearray + bq: bytearray + strand: cython.int + ra: ReadAlignment + # bq_list_t: list = [] + # bq_list_c: list = [] + i: cython.int + pos: cython.int + tip: bool + posreadsinfo_p: PosReadsInfo + + posreadsinfo_p = PosReadsInfo(ref_pos, ref_nt) + + # Treatment group + for i in range(len(self.RAlists[0])): + ra = self.RAlists[0][i] + if ra["lpos"] <= ref_pos and ra["rpos"] > ref_pos: + (s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos(ref_pos) + posreadsinfo_p.add_T(i, bytes(s), bq[0], strand, tip, Q=Q) + + # Control group + for i in range(len(self.RAlists[1])): + ra = self.RAlists[1][i] + if ra["lpos"] <= ref_pos and ra["rpos"] > ref_pos: + (s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos(ref_pos) + posreadsinfo_p.add_C(i, bytes(s), bq[0], strand, Q=Q) + + return posreadsinfo_p + + @cython.ccall + def get_FASTQ(self) -> bytearray: + """Get FASTQ file for all reads in RACollection. + + """ + ra: ReadAlignment + fastq_text: bytearray + + fastq_text = bytearray(b"") + + for ra in self.RAlists[0]: + fastq_text += ra.get_FASTQ() + + for ra in self.RAlists[1]: + fastq_text += ra.get_FASTQ() + + return fastq_text + + @cython.cfunc + def fermi_assemble(self, + fermiMinOverlap: cython.int, + opt_flag: cython.int = 0x80) -> list: + """A wrapper function to call Fermi unitig building functions. + """ + opt: cython.pointer(fml_opt_t) + # c: cython.int + n_seqs: cython.int + n_utg: cython.pointer(cython.int) + seqs: cython.pointer(bseq1_t) + utg: cython.pointer(fml_utg_t) + p: fml_utg_t + + # unitig_k: cython.int + # merge_min_len: cython.int + tmps: bytes + tmpq: bytes + # ec_k: cython.int = -1 + l: cython.long + cseq: cython.pointer(cython.char) + cqual: cython.pointer(cython.char) + i: cython.int + j: cython.int + # tmpunitig: bytes + # unitig: bytes # final unitig + unitig_list: list # contain of: list sequences in format: bytes + # n: cython.pointer(cython.int) + + n_seqs = len(self.RAlists[0]) + len(self.RAlists[1]) + + # prn_seqs: cython.int + + # prepare seq and qual, note, we only extract SEQ according to the + + # strand of reference sequence. 
+    @cython.cfunc
+    def fermi_assemble(self,
+                       fermiMinOverlap: cython.int,
+                       opt_flag: cython.int = 0x80) -> list:
+        """A wrapper function to call Fermi unitig building functions.
+        """
+        opt: cython.pointer(fml_opt_t)
+        # c: cython.int
+        n_seqs: cython.int
+        n_utg: cython.pointer(cython.int)
+        seqs: cython.pointer(bseq1_t)
+        utg: cython.pointer(fml_utg_t)
+        p: fml_utg_t
+
+        # unitig_k: cython.int
+        # merge_min_len: cython.int
+        tmps: bytes
+        tmpq: bytes
+        # ec_k: cython.int = -1
+        l: cython.long
+        cseq: cython.pointer(cython.char)
+        cqual: cython.pointer(cython.char)
+        i: cython.int
+        j: cython.int
+        # tmpunitig: bytes
+        # unitig: bytes  # final unitig
+        unitig_list: list  # contains a list of sequences in bytes format
+        # n: cython.pointer(cython.int)
+
+        n_seqs = len(self.RAlists[0]) + len(self.RAlists[1])
+
+        # print(n_seqs)
+
+        # prepare seq and qual, note, we only extract SEQ according to the +
+        # strand of reference sequence.
+        seqs = cython.cast(cython.pointer(bseq1_t),
+                           malloc(n_seqs * cython.sizeof(bseq1_t)))  # we rely on fermi-lite to free this mem
+
+        i = 0
+        for ra in self.RAlists[0]:
+            tmps = ra["SEQ"]
+            tmpq = ra["QUAL"]
+            l = len(tmps)
+            # we rely on fermi-lite to free this mem
+            cseq = cython.cast(cython.pointer(cython.char),
+                               malloc((l+1)*cython.sizeof(cython.char)))
+            # we rely on fermi-lite to free this mem
+            cqual = cython.cast(cython.pointer(cython.char),
+                                malloc((l+1)*cython.sizeof(cython.char)))
+            for j in range(l):
+                cseq[j] = tmps[j]
+                cqual[j] = tmpq[j] + 33
+            cseq[l] = b'\x00'
+            cqual[l] = b'\x00'
+
+            seqs[i].seq = cseq
+            seqs[i].qual = cqual
+            seqs[i].l_seq = len(tmps)
+            i += 1
+
+            # print("@", ra["readname"].decode())
+            # print(cseq.decode())
+            # print("+")
+            # print(cqual.decode())
+
+        for ra in self.RAlists[1]:
+            tmps = ra["SEQ"]
+            tmpq = ra["QUAL"]
+            l = len(tmps)
+            # we rely on fermi-lite to free this mem
+            cseq = cython.cast(cython.pointer(cython.char),
+                               malloc((l+1)*cython.sizeof(cython.char)))
+            # we rely on fermi-lite to free this mem
+            cqual = cython.cast(cython.pointer(cython.char),
+                                malloc((l+1)*cython.sizeof(cython.char)))
+            for j in range(l):
+                cseq[j] = tmps[j]
+                cqual[j] = tmpq[j] + 33
+            cseq[l] = b'\x00'
+            cqual[l] = b'\x00'
+
+            seqs[i].seq = cseq
+            seqs[i].qual = cqual
+            seqs[i].l_seq = len(tmps)
+            i += 1
+            # print("@", ra["readname"].decode())
+            # print(cseq.decode())
+            # print("+")
+            # print(cqual.decode())
+
+        # if self.RAlists[1]:
+        #     unitig_k=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*fermiOverlapMinRatio)
+        #     merge_min_len=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*0.5)
+        # else:
+        #     unitig_k = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio)
+        #     merge_min_len=int(self.RAlists[0][0]["l"]*0.5)
+        # fermiMinOverlap = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio)
+
+        # minimum overlap to merge, default 0
+        # merge_min_len= max(25, int(self.RAlists[0][0]["l"]*0.5))
+        # merge_min_len= int(self.RAlists[0][0]["l"]*0.5)
+
+        # opt = cython.cast(cython.pointer(fml_opt_t),
+        #                   PyMem_Malloc(cython.sizeof(fml_opt_t)))
+        # n_utg = cython.cast(cython.pointer(cython.int),
+        #                     PyMem_Malloc(cython.sizeof(int)))
+
+        opt = cython.cast(cython.pointer(fml_opt_t),
+                          malloc(cython.sizeof(fml_opt_t)))
+        n_utg = cython.cast(cython.pointer(cython.int),
+                            malloc(cython.sizeof(int)))
+
+        fml_opt_init(opt)
+        # k-mer length for error correction (0 for auto; -1 to disable)
+        # opt.ec_k = 0
+
+        # min overlap length during initial assembly
+        opt.min_asm_ovlp = fermiMinOverlap
+
+        # minimum length to merge; during assembly, don't explicitly merge an overlap if shorter than this value
+        # opt.min_merge_len = merge_min_len
+
+        # there are more 'options' for mag clean:
+        # flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth, min_dratio1, max_bcov, max_bfrac
+        # min_elen (300) will be adjusted
+        # min_ensr (4), min_insr (3) will be computed
+        # min_merge_len (0) will be updated using opt.min_merge_len
+
+        # We can adjust: flag (0x40|0x80), min_ovlp (0), min_dratio1 (0.7), max_bdiff (50), max_bdist (512), max_bvtx (64), trim_len (0), trim_depth (6), max_bcov (10.), max_bfrac (0.15)
+
+        # 0x20: MAG_F_AGGRESSIVE pop variant bubbles
+        # 0x40: MAG_F_POPOPEN aggressive tip trimming
+        # 0x80: MAG_F_NO_SIMPL skip bubble simplification
+        opt.mag_opt.flag = opt_flag
+
+        # mag_opt.min_ovlp
+        # opt.mag_opt.min_ovlp = fermiMinOverlap
+
+        # drop an overlap if its length is below maxOvlpLen*FLOAT
+        # opt.mag_opt.min_dratio1 = 0.5
+
+        # retain a bubble if one side is longer than the other side by >INT-bp
+        # opt.mag_opt.max_bdiff = 10  # merge_min_len
+
+        # trim_len:
+        # trim_depth: Parameter used to trim the open end/tip. If trim_len == 0, do nothing
+
+        # max_bdist:
+        # max_bvtx: Parameter used to simplify bubble while 0x80 flag is set.
+        # opt.mag_opt.max_bdist = 1024
+        # opt.mag_opt.max_bvtx = 128
+
+        # max_bcov:
+        # max_bfrac: Parameter used when aggressive bubble removal is not used. Bubble will be removed if its average coverage is lower than max_bcov and fraction (cov1/(cov1+cov2)) is lower than max_bfrac
+        # opt.mag_opt.max_bcov = 10.
+        # opt.mag_opt.max_bfrac = 0.01
+
+        utg = fml_assemble(opt, n_seqs, seqs, n_utg)
+        # get results
+        unitig_list = []
+        for i in range(n_utg[0]):
+            p = utg[i]
+            if (p.len < 0):
+                continue
+            # unitig = b''
+            # for j in range(p.len):
+            #     unitig += [b'A',b'C',b'G',b'T',b'N'][int(p.seq[j]) - 1]
+            # unitig_list.append(unitig)
+            unitig_list.append(p.seq)
+
+        fml_utg_destroy(n_utg[0], utg)
+
+        # PyMem_Free(opt)
+        # PyMem_Free(n_utg)
+        free(opt)
+        free(n_utg)
+
+        return unitig_list
+
+    @cython.cfunc
+    def align_unitig_to_REFSEQ(self, unitig_list: list) -> tuple:
+        """Note: we use smith waterman, but we don't use linear gap
+        penalty at this time.
+
+        Also, if unitig is mapped to - strand, we will revcomp the
+        unitig. So the unitig_list will be changed in this case.
+        """
+        unitig: bytes
+        problem: seq_pair_t
+        results: cython.pointer(align_t)
+        # tmp: cython.pointer(cython.char)
+        target: bytes
+        reference: bytes
+        target_aln_f: bytes
+        target_aln_r: bytes
+        reference_aln_f: bytes
+        reference_aln_r: bytes
+        markup_aln_f: bytes
+        markup_aln_r: bytes
+        score_f: cython.double
+        score_r: cython.double
+        target_alns: list = []
+        reference_alns: list = []
+        markup_alns: list = []
+        aln_scores: list = []
+        i: cython.int
+
+        reference = copy(self.peak_refseq_ext+b'\x00')
+
+        for i in range(len(unitig_list)):
+            unitig = unitig_list[i]
+            target = copy(unitig + b'\x00')
+            # we use swalign.c for local alignment (without affine gap
+            # penalty). Will revise later.
+            problem.a = target
+            problem.alen = len(unitig)
+            problem.b = reference
+            problem.blen = len(self.peak_refseq_ext)
+            results = smith_waterman(cython.address(problem))
+            target_aln_f = results.seqs.a
+            reference_aln_f = results.seqs.b
+            markup_aln_f = results.markup
+            score_f = results.score
+            free(results.seqs.a)
+            free(results.seqs.b)
+            free(results.markup)
+            free(results)
+            # end of local alignment
+
+            # try reverse complement
+            target = copy(unitig[::-1] + b'\x00')
+            target = target.translate(__DNACOMPLEMENT__)
+            problem.a = target
+            problem.alen = len(unitig)
+            problem.b = reference
+            problem.blen = len(self.peak_refseq_ext)
+            results = smith_waterman(cython.address(problem))
+            target_aln_r = results.seqs.a
+            reference_aln_r = results.seqs.b
+            markup_aln_r = results.markup
+            score_r = results.score
+            free(results.seqs.a)
+            free(results.seqs.b)
+            free(results.markup)
+            free(results)
+            # end of local alignment
+
+            if score_f > score_r:
+                target_alns.append(target_aln_f)
+                reference_alns.append(reference_aln_f)
+                markup_alns.append(markup_aln_f)
+                aln_scores.append(score_f)
+            else:
+                target_alns.append(target_aln_r)
+                reference_alns.append(reference_aln_r)
+                markup_alns.append(markup_aln_r)
+                aln_scores.append(score_r)
+                # we will revcomp unitig
+                unitig = unitig[::-1]
+                unitig_list[i] = unitig.translate(__DNACOMPLEMENT__)
+
+        return (target_alns, reference_alns, aln_scores, markup_alns)
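A compact editorial sketch of the orientation test above: each unitig and its reverse complement are both aligned to the reference, the higher-scoring orientation is kept, and when the reverse wins the stored unitig is replaced by its reverse complement. Here bytes.maketrans stands in for the module-level __DNACOMPLEMENT__ table and `score` is a placeholder callable for the swalign smith_waterman step:

    COMP = bytes.maketrans(b'ACGTacgt', b'TGCAtgca')

    def revcomp(seq: bytes) -> bytes:
        return seq[::-1].translate(COMP)

    def best_orientation(unitig: bytes, ref: bytes, score) -> tuple:
        fwd = score(unitig, ref)
        rev = score(revcomp(unitig), ref)
        if fwd > rev:
            return unitig, fwd
        # reverse orientation wins: store the revcomp'ed unitig, as above
        return revcomp(unitig), rev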
+ """ + RAlists_T: list = [] # lists of of: list RAs of ChIP mapped to each unitig + RAlists_C: list = [] + unmapped_RAlist_T: list = [] # of: list RAs of ChIP unmappable to unitigs + unmapped_RAlist_C: list = [] + # RACollection unmapped_ra_collection + flag: cython.int = 0 + i: cython.int + tmp_ra: ReadAlignment + tmp_ra_seq: bytes + unitig: bytes + + for i in range(len(unitig_list)): + RAlists_T.append([]) # for each unitig, there is another of: list RAs + RAlists_C.append([]) + + # assign RAs to unitigs + + for tmp_ra in self.RAlists[0]: + flag = 0 + tmp_ra_seq = tmp_ra["SEQ"] + for i in range(len(unitig_list)): + unitig = unitig_list[i] + if tmp_ra_seq in unitig: + flag = 1 + RAlists_T[i].append(tmp_ra) + break + if flag == 0: + unmapped_RAlist_T.append(tmp_ra) + # print "unmapped:", tmp_ra["SEQ"] + + for tmp_ra in self.RAlists[1]: + flag = 0 + tmp_ra_seq = tmp_ra["SEQ"] + for i in range(len(unitig_list)): + unitig = unitig_list[i] + if tmp_ra_seq in unitig: + flag = 1 + RAlists_C[i].append(tmp_ra) + break + if flag == 0: + unmapped_RAlist_C.append(tmp_ra) + # print "unmapped:", tmp_ra["SEQ"] + + # if unmapped_RAlist_T: + # unmapped_ra_collection = RACollection(self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C) + return [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] + + @cython.cfunc + def add_to_unitig_list(self, unitig_list, unitigs_2nd) -> list: + """ + """ + i: cython.int + j: cython.int + flag: cython.int + u0: bytes + u1: bytes + new_unitig_list: list + + new_unitig_list = [] + + for i in range(len(unitigs_2nd)): + # initial value: can't be found in unitig_list + flag = 0 + u0 = unitigs_2nd[i] + for j in range(len(unitig_list)): + u1 = unitig_list[j] + if u1.find(u0) != -1: + flag = 1 + break + u1 = u1[::-1].translate(__DNACOMPLEMENT__) + if u1.find(u0) != -1: + flag = 1 + break + if not flag: + new_unitig_list.append(u0) + new_unitig_list.extend(unitig_list) + return new_unitig_list + + @cython.ccall + def build_unitig_collection(self, fermiMinOverlap): + """unitig_and: list tuple_alns are in the same order! + + return UnitigCollection object. 
+ + """ + start: cython.long + end: cython.long + unitigs_2nd: list + # u: bytes + # target_alns: list + reference_alns: list + aln_scores: list + markup_alns: list + # target_alns_2nd: list + # reference_alns_2nd: list + # aln_scores_2nd: list + RAlists_T: list = [] # lists of of: list RAs of ChIP mapped to each unitig + RAlists_C: list = [] + unmapped_RAlist_T: list = [] + unmapped_RAlist_C: list = [] + # tmp_unitig_seq: bytes + tmp_reference_seq: bytes + tmp_unitig_aln: bytes + tmp_reference_aln: bytes + + i: cython.int + j: cython.int + left_padding_ref: cython.long + right_padding_ref: cython.long + left_padding_unitig: cython.long + right_padding_unitig: cython.long + ura_list: list = [] + unmapped_ra_collection: RACollection + # flag: cython.int = 0 + # n_unmapped: cython.int + n_unitigs_0: cython.int + n_unitigs_1: cython.int + + # first round of assembly + # print (" First round to assemble unitigs") + unitig_list = self.fermi_assemble(fermiMinOverlap, opt_flag=0x80) + if len(unitig_list) == 0: + return 0 + + n_unitigs_0 = -1 + n_unitigs_1 = len(unitig_list) + # print " # of Unitigs:", n_unitigs_1 + # print " Map reads to unitigs" + (unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ(unitig_list) + + self.verify_alns(unitig_list, + unitig_alns, + reference_alns, + aln_scores, + markup_alns) + if len(unitig_list) == 0: + # if stop here, it raises a flag that the region may + # contain too many mismapped reads, we return -1 + return -1 + # print (" # of Unitigs:", n_unitigs_1) + + # assign RAs to unitigs + [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] = self.remap_RAs_w_unitigs(unitig_list) + # prunmapped_ra_collection: cython.int.get_FASTQ().decode() + + # n_unmapped = len(unmapped_RAlist_T) + len(unmapped_RAlist_C) + + while len(unmapped_RAlist_T) > 0 and n_unitigs_1 != n_unitigs_0: + # if there are unmapped reads AND we can get more unitigs + # from last round of assembly, do assembly again + + # print (" # of RAs not mapped, will be assembled again:", n_unmapped) + n_unitigs_0 = n_unitigs_1 + # another round of assembly + unmapped_ra_collection = RACollection(self.chrom, + self.peak, + unmapped_RAlist_T, + unmapped_RAlist_C) + unitigs_2nd = unmapped_ra_collection.fermi_assemble(fermiMinOverlap, + opt_flag=0x80) + + if unitigs_2nd: + unitig_list = self.add_to_unitig_list(unitig_list, unitigs_2nd) + n_unitigs_1 = len(unitig_list) + # print " # of Unitigs:", n_unitigs_1 + # print " Map reads to unitigs" + (unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ(unitig_list) + self.verify_alns(unitig_list, + unitig_alns, + reference_alns, + aln_scores, + markup_alns) + [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] = self.remap_RAs_w_unitigs(unitig_list) + # n_unmapped = len(unmapped_RAlist_T) + len(unmapped_RAlist_C) + # else: + # for r in unmapped_RAlist_T: + # prr: cython.int.get_FASTQ().decode().lstrip() + + # print (" # of RAs not mapped, will be assembled again with 1/2 of fermiMinOverlap:", n_unmapped) + # another round of assembly + unmapped_ra_collection = RACollection(self.chrom, + self.peak, + unmapped_RAlist_T, + unmapped_RAlist_C) + unitigs_2nd = unmapped_ra_collection.fermi_assemble(fermiMinOverlap/2, + opt_flag=0x80) + + if unitigs_2nd: + unitig_list = self.add_to_unitig_list(unitig_list, unitigs_2nd) + n_unitigs_1 = len(unitig_list) + # print " # of Unitigs:", n_unitigs_1 + # print " Map reads to unitigs" + (unitig_alns, reference_alns, aln_scores, markup_alns) = 
self.align_unitig_to_REFSEQ(unitig_list) + self.verify_alns(unitig_list, + unitig_alns, + reference_alns, + aln_scores, + markup_alns) + [RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C] = self.remap_RAs_w_unitigs(unitig_list) + # n_unmapped = len(unmapped_RAlist_T) + len(unmapped_RAlist_C) + # else: + # for r in unmapped_RAlist_T: + # prr: cython.int.get_FASTQ().decode().lstrip() + if len(unitig_list) == 0: + raise Exception("Shouldn't reach here") + # print (" # of Unitigs:", n_unitigs_1) + + if len(unitig_list) == 0: + return None + # print (" Final round: # of Unitigs:", len(unitig_list)) + # print (" Final round: # of RAs not mapped:", n_unmapped) + + start = min(self.left, self.RAs_left) + end = max(self.right, self.RAs_right) + + # create UnitigCollection + for i in range(len(unitig_list)): + #b'---------------------------AAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAA--------------' + #b'GATATAAATAGGATGTTATGAGTTTTCAAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAAAATACAAATGTATT' + tmp_unitig_aln = unitig_alns[i] + tmp_reference_aln = reference_alns[i] + # tmp_unitig_seq = tmp_unitig_aln.replace(b'-',b'') + tmp_reference_seq = tmp_reference_aln.replace(b'-', b'') + + # prtmp_unitig_aln: cython.int + # prtmp_reference_aln: cython.int + # prtmp_unitig_seq: cython.int + # prtmp_reference_aln: cython.int + + # find the position on self.peak_refseq_ext + left_padding_ref = self.peak_refseq_ext.find(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from left + right_padding_ref = len(self.peak_refseq_ext) - left_padding_ref - len(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from right + + # now, decide the lpos and rpos on reference of this unitig + # first, trim left padding '-' + left_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.lstrip(b'-')) + right_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.rstrip(b'-')) + + tmp_lpos = start + left_padding_ref + tmp_rpos = end - right_padding_ref + + for j in range(left_padding_unitig): + if tmp_reference_aln[j] != b'-': + tmp_lpos += 1 + for j in range(1, right_padding_unitig + 1): + if tmp_reference_aln[-j] != b'-': + tmp_rpos -= 1 + + tmp_unitig_aln = tmp_unitig_aln[left_padding_unitig:(len(tmp_unitig_aln)-right_padding_unitig)] + tmp_reference_aln = tmp_reference_aln[left_padding_unitig:(len(tmp_reference_aln)-right_padding_unitig)] + + ura_list.append(UnitigRAs(self.chrom, tmp_lpos, tmp_rpos, tmp_unitig_aln, tmp_reference_aln, [RAlists_T[i], RAlists_C[i]])) + + return UnitigCollection(self.chrom, self.peak, ura_list) diff --git a/MACS3/Signal/RACollection.pyx b/MACS3/Signal/RACollection.pyx deleted file mode 100644 index c0650dee..00000000 --- a/MACS3/Signal/RACollection.pyx +++ /dev/null @@ -1,898 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2021-03-10 23:39:52 Tao Liu> - -"""Module for SAPPER BAMParser class - -Copyright (c) 2017 Tao Liu - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the 
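Editorial summary of the control flow in build_unitig_collection, which ends just above: assemble unitigs, align and verify them against the reference, remap reads, then keep reassembling the still-unmapped reads (at the full minimum overlap, then at half of it) until everything maps or a round adds no new unitigs. A schematic with the assembler, remapper and merge step passed in or stubbed as hypothetical callables:

    def merge_unique(unitigs, extra):
        # keep only new sequences not already contained in an existing unitig
        # (the real add_to_unitig_list also checks the reverse complement)
        return [u for u in extra if not any(u in v for v in unitigs)] + unitigs

    def iterative_assembly(reads, assemble, remap, min_ovlp):
        unitigs = assemble(reads, min_ovlp)
        if not unitigs:
            return [], [], reads
        mapped, unmapped = remap(reads, unitigs)
        n_prev, n_now = -1, len(unitigs)
        while unmapped and n_now != n_prev:
            n_prev = n_now
            for ovlp in (min_ovlp, min_ovlp // 2):  # second pass relaxes the overlap
                extra = assemble(unmapped, ovlp)
                if extra:
                    unitigs = merge_unique(unitigs, extra)
                    mapped, unmapped = remap(reads, unitigs)
            n_now = len(unitigs)
        return unitigs, mapped, unmapped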
file COPYING included -with the distribution). - -@status: experimental -@version: $Revision$ -@author: Tao Liu -@contact: tliu4@buffalo.edu -""" -# ------------------------------------ -# python modules -# ------------------------------------ -import struct -from collections import Counter -from operator import itemgetter -from copy import copy - -from MACS3.Signal.ReadAlignment import ReadAlignment -from MACS3.Signal.PosReadsInfo import PosReadsInfo -from MACS3.Signal.UnitigRACollection import UnitigRAs, UnitigCollection -from MACS3.IO.PeakIO import PeakIO - -from cpython cimport bool -from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free - -import numpy as np -cimport numpy as np -from numpy cimport uint32_t, uint64_t, int32_t, int64_t - -from libc.stdlib cimport malloc, free, realloc - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - #void *malloc(size_t size) - void *calloc(size_t n, size_t size) - #void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) - - -# --- fermi-lite functions --- -#define MAG_F_AGGRESSIVE 0x20 // pop variant bubbles (not default) -#define MAG_F_POPOPEN 0x40 // aggressive tip trimming (default) -#define MAG_F_NO_SIMPL 0x80 // skip bubble simplification (default) - -cdef extern from "fml.h": - ctypedef struct bseq1_t: - int32_t l_seq - char *seq - char *qual # NULL-terminated strings; length expected to match $l_seq - - ctypedef struct magopt_t: - int flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth - float min_dratio1, max_bcov, max_bfrac - - ctypedef struct fml_opt_t: - int n_threads # number of threads; don't use multi-threading for small data sets - int ec_k # k-mer length for error correction; 0 for auto estimate - int min_cnt, max_cnt # both occ threshold in ec and tip threshold in cleaning lie in [min_cnt,max_cnt] - int min_asm_ovlp # min overlap length during assembly - int min_merge_len # during assembly, don't explicitly merge an overlap if shorter than this value - magopt_t mag_opt # graph cleaning options - - ctypedef struct fml_ovlp_t: - uint32_t len_, from_, id_, to_ - #unit32_t from # $from and $to: 0 meaning overlapping 5'-end; 1 overlapping 3'-end - #uint32_t id - #uint32_t to # $id: unitig number - - ctypedef struct fml_utg_t: - int32_t len # length of sequence - int32_t nsr # number of supporting reads - char *seq # unitig sequence - char *cov # cov[i]-33 gives per-base coverage at i - int n_ovlp[2] # number of 5'-end [0] and 3'-end [1] overlaps - fml_ovlp_t *ovlp # overlaps, of size n_ovlp[0]+n_ovlp[1] - - void fml_opt_init(fml_opt_t *opt) - fml_utg_t* fml_assemble(const fml_opt_t *opt, int n_seqs, bseq1_t *seqs, int *n_utg) - void fml_utg_destroy(int n_utg, fml_utg_t *utg) - void fml_utg_print(int n_utgs, const fml_utg_t *utg) - bseq1_t *bseq_read(const char *fn, int *n) - -# --- end of fermi-lite functions --- - -# --- smith-waterman alignment functions --- - -cdef extern from "swalign.h": - ctypedef struct seq_pair_t: - char *a - unsigned int alen - char *b - unsigned int blen - ctypedef struct align_t: - seq_pair_t *seqs - char *markup; - int start_a - int start_b - int end_a - int end_b - int matches - int gaps - double score - align_t *smith_waterman(seq_pair_t *problem) - void destroy_seq_pair(seq_pair_t *pair) - void destroy_align(align_t *ali) - -# ------------------------------------ -# constants -# ------------------------------------ 
-__version__ = "Parser $Revision$" -__author__ = "Tao Liu " -__doc__ = "All Parser classes" - -__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. - -__CIGARCODE__ = "MIDNSHP=X" - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class RACollection: - """A collection of ReadAlignment objects and the corresponding - PeakIO. - - """ - cdef: - bytes chrom - object peak # A PeakIO object - list RAlists # contain ReadAlignment lists for treatment (0) and control (1) - long left # left position of peak - long right # right position of peak - long length # length of peak - long RAs_left # left position of all RAs in the collection - long RAs_right # right position of all RAs in the collection - bool sorted # if sorted by lpos - bytes peak_refseq # reference sequence in peak region b/w left and right - bytes peak_refseq_ext # reference sequence in peak region with extension on both sides b/w RAs_left and RAs_right - - def __init__ ( self, chrom, peak, RAlist_T, RAlist_C=[] ): - """Create RACollection object by taking: - - 1. peak: a PeakIO object indicating the peak region. - - 2. RAlist: a python list of ReadAlignment objects containing - all the reads overlapping the peak region. If no RAlist_C - given, it will be []. 
- - """ - if len(RAlist_T) == 0: - # no reads, return None - raise Exception("No reads from ChIP sample to construct RAcollection!") - self.chrom = chrom - self.peak = peak - #print(len(RAlist_T),"\n") - #print(len(RAlist_C),"\n") - self.RAlists = [ RAlist_T, RAlist_C ] - self.left = peak["start"] - self.right = peak["end"] - self.length = self.right - self.left - if RAlist_T: - self.RAs_left = RAlist_T[0]["lpos"] # initial assignment of RAs_left - self.RAs_right = RAlist_T[-1]["rpos"] # initial assignment of RAs_right - self.sort() # it will set self.sorted = True - else: - self.RAs_left = -1 - self.RAs_right = -1 - # check RAs_left and RAs_right - for ra in RAlist_T: - if ra[ "lpos" ] < self.RAs_left: - self.RAs_left = ra[ "lpos" ] - if ra[ "rpos" ] > self.RAs_right: - self.RAs_right = ra[ "rpos" ] - - for ra in RAlist_C: - if ra[ "lpos" ] < self.RAs_left: - self.RAs_left = ra[ "lpos" ] - if ra[ "rpos" ] > self.RAs_right: - self.RAs_right = ra[ "rpos" ] - (self.peak_refseq, self.peak_refseq_ext) = self.__get_peak_REFSEQ() - - def __getitem__ ( self, keyname ): - if keyname == "chrom": - return self.chrom - elif keyname == "left": - return self.left - elif keyname == "right": - return self.right - elif keyname == "RAs_left": - return self.RAs_left - elif keyname == "RAs_right": - return self.RAs_right - elif keyname == "length": - return self.length - elif keyname == "count": - return len( self.RAlists[ 0 ] )+ len( self.RAlists[ 1 ] ) - elif keyname == "count_T": - return len( self.RAlists[ 0 ] ) - elif keyname == "count_C": - return len( self.RAlists[ 1 ] ) - elif keyname == "peak_refseq": - return self.peak_refseq - elif keyname == "peak_refseq_ext": - return self.peak_refseq_ext - else: - raise KeyError("Unavailable key:", keyname) - - def __getstate__ ( self ): - #return {"chrom":self.chrom, "peak":self.peak, "RAlists":self.RAlists, - # "left":self.left, "right":self.right, "length": self.length, - # "RAs_left":self.RAs_left, "RAs_right":self.RAs_right} - return (self.chrom, self.peak, self.RAlists, self.left, self.right, self.length, self.RAs_left, self.RAs_right, self.peak_refseq, self.peak_refseq_ext) - - def __setstate__ ( self, state ): - (self.chrom, self.peak, self.RAlists, self.left, self.right, self.length, self.RAs_left, self.RAs_right, self.peak_refseq, self.peak_refseq_ext) = state - - cpdef sort ( self ): - """Sort RAs according to lpos. Should be used after realignment. - - """ - if self.RAlists[ 0 ]: - self.RAlists[ 0 ].sort(key=itemgetter("lpos")) - if self.RAlists[ 1 ]: - self.RAlists[ 1 ].sort(key=itemgetter("lpos")) - self.sorted = True - return - - cpdef remove_outliers ( self, int percent = 5 ): - """ Remove outliers with too many n_edits. The outliers with - n_edits in top p% will be removed. - - Default: remove top 5% of reads that have too many differences - with reference genome. 
- """ - cdef: - list n_edits_list - object read # ReadAlignment object - int highest_n_edits - list new_RAlist - int i - - n_edits_list = [] - for ralist in self.RAlists: - for read in ralist: - n_edits_list.append( read["n_edits"] ) - n_edits_list.sort() - highest_n_edits = n_edits_list[ int( len( n_edits_list ) * (1 - percent * .01) ) ] - - for i in ( range(len(self.RAlists)) ): - new_RAlist = [] - for read in self.RAlists[ i ]: - if read["n_edits"] <= highest_n_edits: - new_RAlist.append( read ) - self.RAlists[ i ] = new_RAlist - - return - - cpdef n_edits_sum ( self ): - """ - """ - cdef: - list n_edits_list - object read - int highest_n_edits - - n_edits_list = [] - - for ralist in self.RAlists: - for read in ralist: - n_edits_list.append( read["n_edits"] ) - - n_edits_list.sort() - # print ( n_edits_list ) - c = Counter( n_edits_list ) - #print( c ) - - cdef tuple __get_peak_REFSEQ ( self ): - """Get the reference sequence within the peak region. - - """ - cdef: - bytearray peak_refseq - int i - long prev_r #remember the previous filled right end - long start - long end - long ind, ind_r - object read - bytearray read_refseq_ext - bytearray read_refseq - - start = min( self.RAs_left, self.left ) - end = max( self.RAs_right, self.right ) - #print ("left",start,"right",end) - peak_refseq_ext = bytearray( b'N' * ( end - start ) ) - - # for treatment. - peak_refseq_ext = self.__fill_refseq ( peak_refseq_ext, self.RAlists[0] ) - # and control if available. - if self.RAlists[1]: - peak_refseq_ext = self.__fill_refseq ( peak_refseq_ext, self.RAlists[1] ) - - # trim - peak_refseq = peak_refseq_ext[ self.left - start: self.right - start ] - return ( bytes( peak_refseq ), bytes( peak_refseq_ext ) ) - - cdef bytearray __fill_refseq ( self, bytearray seq, list ralist ): - """Fill refseq sequence of whole peak with refseq sequences of - each read in ralist. - - """ - cdef: - long prev_r # previous right position of last - # filled - long ind, ind_r - long start, end - object read - bytearray read_refseq - - start = min( self.RAs_left, self.left ) - - #print(len(ralist),"\n") - prev_r = ralist[0]["lpos"] - - for i in range( len( ralist ) ): - read = ralist[ i ] - if read[ "lpos" ] > prev_r: - read = ralist[ i - 1 ] - read_refseq = read.get_REFSEQ() - ind = read["lpos"] - start - ind_r = ind + read["rpos"] - read["lpos"] - seq[ ind: ind_r ] = read_refseq - prev_r = read[ "rpos" ] - # last - read = ralist[ -1 ] - read_refseq = read.get_REFSEQ() - ind = read["lpos"] - start - ind_r = ind + read["rpos"] - read["lpos"] - seq[ ind: ind_r ] = read_refseq - return seq - - cpdef get_PosReadsInfo_ref_pos ( self, long ref_pos, bytes ref_nt, int Q=20 ): - """Generate a PosReadsInfo object for a given reference genome - position. - - Return a PosReadsInfo object. 
- - """ - cdef: - bytearray s - bytearray bq - int strand - object ra - list bq_list_t = [] - list bq_list_c = [] - int i - int pos - bool tip - - posreadsinfo_p = PosReadsInfo( ref_pos, ref_nt ) - - #Treatment group - for i in range( len( self.RAlists[ 0 ] ) ): - ra = self.RAlists[ 0 ][ i ] - if ra[ "lpos" ] <= ref_pos and ra[ "rpos" ] > ref_pos: - ( s, bq, strand, tip, pos) = ra.get_variant_bq_by_ref_pos( ref_pos ) - posreadsinfo_p.add_T( i, bytes( s ), bq[ 0 ], strand, tip, Q=Q ) - - #Control group - for i in range( len( self.RAlists[ 1 ] ) ): - ra = self.RAlists[ 1 ][ i ] - if ra[ "lpos" ] <= ref_pos and ra[ "rpos" ] > ref_pos: - ( s, bq, strand, tip, pos ) = ra.get_variant_bq_by_ref_pos( ref_pos ) - posreadsinfo_p.add_C( i, bytes( s ), bq[ 0 ], strand, Q=Q ) - - return posreadsinfo_p - - cpdef bytearray get_FASTQ ( self ): - """Get FASTQ file for all reads in RACollection. - - """ - cdef: - object ra - bytearray fastq_text - - fastq_text = bytearray(b"") - - for ra in self.RAlists[0]: - fastq_text += ra.get_FASTQ() - - for ra in self.RAlists[1]: - fastq_text += ra.get_FASTQ() - - return fastq_text - - cdef list fermi_assemble( self, int fermiMinOverlap, int opt_flag = 0x80 ): - """A wrapper function to call Fermi unitig building functions. - """ - cdef: - fml_opt_t *opt - int c, n_seqs - int * n_utg - bseq1_t *seqs - fml_utg_t *utg - fml_utg_t p - - int unitig_k, merge_min_len - bytes tmps - bytes tmpq - int ec_k = -1 - int64_t l - char * cseq - char * cqual - int i, j - bytes tmpunitig - bytes unitig #final unitig - list unitig_list # contain list of sequences in bytes format - int * n - - n_seqs = len(self.RAlists[0]) + len(self.RAlists[1]) - - # print n_seqs - - # prepare seq and qual, note, we only extract SEQ according to the + - # strand of reference sequence. 
- seqs = malloc( n_seqs * sizeof(bseq1_t) ) # we rely on fermi-lite to free this mem - - i = 0 - for ra in self.RAlists[0]: - tmps = ra["SEQ"] - tmpq = ra["QUAL"] - l = len(tmps) - cseq = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - cqual = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - for j in range(l): - cseq[ j ] = tmps[ j ] - cqual[ j ] = tmpq[ j ] + 33 - cseq[ l ] = b'\x00' - cqual[ l ]= b'\x00' - - seqs[ i ].seq = cseq - seqs[ i ].qual = cqual - seqs[ i ].l_seq = len(tmps) - i += 1 - - # print "@",ra["readname"].decode() - # print cseq.decode() - # print "+" - # print cqual.decode() - - for ra in self.RAlists[1]: - tmps = ra["SEQ"] - tmpq = ra["QUAL"] - l = len(tmps) - cseq = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - cqual = malloc( (l+1)*sizeof(char))# we rely on fermi-lite to free this mem - for j in range(l): - cseq[ j ] = tmps[ j ] - cqual[ j ] = tmpq[ j ] + 33 - cseq[ l ] = b'\x00' - cqual[ l ]= b'\x00' - - seqs[ i ].seq = cseq - seqs[ i ].qual = cqual - seqs[ i ].l_seq = len(tmps) - i += 1 - # print "@",ra["readname"].decode() - # print cseq.decode() - # print "+" - # print cqual.decode() - - # if self.RAlists[1]: - # unitig_k=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*fermiOverlapMinRatio) - - # merge_min_len=int(min(self.RAlists[0][0]["l"],self.RAlists[1][0]["l"])*0.5) - # else: - # unitig_k = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio) - - # merge_min_len=int(self.RAlists[0][0]["l"]*0.5) - #fermiMinOverlap = int(self.RAlists[0][0]["l"]*fermiOverlapMinRatio) - - # minimum overlap to merge, default 0 - # merge_min_len= max( 25, int(self.RAlists[0][0]["l"]*0.5) ) - # merge_min_len= int(self.RAlists[0][0]["l"]*0.5) - - opt = PyMem_Malloc( sizeof(fml_opt_t) ) - n_utg = PyMem_Malloc( sizeof(int) ) - - fml_opt_init(opt) - # k-mer length for error correction (0 for auto; -1 to disable) - #opt.ec_k = 0 - - # min overlap length during initial assembly - opt.min_asm_ovlp = fermiMinOverlap - - # minimum length to merge, during assembly, don't explicitly merge an overlap if shorter than this value - # opt.min_merge_len = merge_min_len - - # there are more 'options' for mag clean: - # flag, min_ovlp, min_elen, min_ensr, min_insr, max_bdist, max_bdiff, max_bvtx, min_merge_len, trim_len, trim_depth, min_dratio1, max_bcov, max_bfrac - # min_elen (300) will be adjusted - # min_ensr (4), min_insr (3) will be computed - # min_merge_len (0) will be updated using opt.min_merge_len - - # We can adjust: flag (0x40|0x80), min_ovlp (0), min_dratio1 (0.7), max_bdiff (50), max_bdist (512), max_bvtx (64), trim_len (0), trim_depth (6), max_bcov (10.), max_bfrac (0.15) - - # 0x20: MAG_F_AGGRESSIVE pop variant bubbles - # 0x40: MAG_F_POPOPEN aggressive tip trimming - # 0x80: MAG_F_NO_SIMPL skip bubble simplification - opt.mag_opt.flag = opt_flag - - # mag_opt.min_ovlp - #opt.mag_opt.min_ovlp = fermiMinOverlap - - # drop an overlap if its length is below maxOvlpLen*FLOAT - #opt.mag_opt.min_dratio1 = 0.5 - - # retain a bubble if one side is longer than the other side by >INT-bp - #opt.mag_opt.max_bdiff = 10#merge_min_len - - # trim_len: - # trim_depth: Parameter used to trim the open end/tip. If trim_len == 0, do nothing - - # max_bdist: - # max_bvtx: Parameter used to simply bubble while 0x80 flag is set. - #opt.mag_opt.max_bdist = 1024 - #opt.mag_opt.max_bvtx = 128 - - # max_bcov: - # max_bfrac: Parameter used when aggressive bubble removal is not used. 
Bubble will be removed if its average coverage lower than max_bcov and fraction (cov1/(cov1+cov2)) is lower than max_bfrac - #opt.mag_opt.max_bcov = 10. - #opt.mag_opt.max_bfrac = 0.01 - - utg = fml_assemble(opt, n_seqs, seqs, n_utg) - # get results - unitig_list = [] - for i in range( n_utg[0] ): - p = utg[ i ] - if (p.len < 0): - continue - #unitig = b'' - #for j in range( p.len ): - # unitig += [b'A',b'C',b'G',b'T',b'N'][int(p.seq[j]) - 1] - #unitig_list.append( unitig ) - unitig_list.append( p.seq ) - - fml_utg_destroy(n_utg[0], utg) - - PyMem_Free( opt ) - PyMem_Free( n_utg ) - - return unitig_list - - cdef tuple align_unitig_to_REFSEQ ( self, list unitig_list ): - """Note: we use smith waterman, but we don't use linear gap - penalty at this time. - - Also, if unitig is mapped to - strand, we will revcomp the - unitig. So the unitig_list will be changed in this case. - """ - - cdef: - bytes unitig - seq_pair_t problem - align_t * results - char * tmp - bytes target - bytes reference - bytes target_aln_f, target_aln_r - bytes reference_aln_f, reference_aln_r - bytes markup_aln_f, markup_aln_r - double score_f, score_r - list target_alns = [] - list reference_alns = [] - list markup_alns = [] - list aln_scores = [] - int i - - reference = copy(self.peak_refseq_ext+b'\x00') - - for i in range( len(unitig_list) ): - unitig = unitig_list[ i ] - target = copy(unitig + b'\x00') - # we use swalign.c for local alignment (without affine gap - # penalty). Will revise later. - problem.a = target - problem.alen = len( unitig ) - problem.b = reference - problem.blen = len( self.peak_refseq_ext ) - results = smith_waterman( &problem ) - target_aln_f = results.seqs.a - reference_aln_f = results.seqs.b - markup_aln_f = results.markup - score_f = results.score - free( results.seqs.a ) - free( results.seqs.b ) - free( results.markup ) - free( results ) - # end of local alignment - - # try reverse complement - target = copy(unitig[::-1] + b'\x00') - target = target.translate( __DNACOMPLEMENT__ ) - problem.a = target - problem.alen = len( unitig ) - problem.b = reference - problem.blen = len( self.peak_refseq_ext ) - results = smith_waterman( &problem ) - target_aln_r = results.seqs.a - reference_aln_r = results.seqs.b - markup_aln_r = results.markup - score_r = results.score - free( results.seqs.a ) - free( results.seqs.b ) - free( results.markup ) - free( results ) - # end of local alignment - - if score_f > score_r: - target_alns.append( target_aln_f ) - reference_alns.append( reference_aln_f ) - markup_alns.append( markup_aln_f ) - aln_scores.append( score_f ) - else: - target_alns.append( target_aln_r ) - reference_alns.append( reference_aln_r ) - markup_alns.append( markup_aln_r ) - aln_scores.append( score_r ) - # we will revcomp unitig - unitig = unitig[::-1] - unitig_list[ i ] = unitig.translate( __DNACOMPLEMENT__ ) - - return ( target_alns, reference_alns, aln_scores, markup_alns ) - - cdef verify_alns( self, unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns, float min_score_100 = 150 ): - """Remove aln/unitig if it contains too many edits in a small region - - default min score is 150, which means under 2/-3/-5/-2 scoring schema, there are 10 mismatches within 100bps region. 
- """ - cdef: - int i - for i in range( len( unitig_list )-1, -1, -1 ): - #print i, aln_scores[ i ] - #print unitig_alns[ i ] - #print markup_alns[ i ] - #print reference_alns[ i ] - if aln_scores[ i ] * 100 /len( markup_alns[ i ] ) < min_score_100: - unitig_list.pop( i ) - unitig_alns.pop( i ) - reference_alns.pop( i ) - aln_scores.pop( i ) - markup_alns.pop( i ) - return - - - cdef tuple filter_unitig_with_bad_aln ( self, list unitig_list, list target_alns, list reference_alns, float gratio = 0.25 ): - """Remove unitigs that has too much gaps (both on target and reference) during alignments. - """ - pass - - cdef list remap_RAs_w_unitigs ( self, list unitig_list ): - """Remap RAs to unitigs, requiring perfect match. - - Return RAlists_T, RAlists_C, unmapped_racollection. - """ - cdef: - list RAlists_T = [] # lists of list of RAs of ChIP mapped to each unitig - list RAlists_C = [] - list unmapped_RAlist_T = [] # list of RAs of ChIP unmappable to unitigs - list unmapped_RAlist_C = [] - #RACollection unmapped_ra_collection - int flag = 0 - int i - object tmp_ra - bytes tmp_ra_seq, unitig - - for i in range( len(unitig_list) ): - RAlists_T.append([]) # for each unitig, there is another list of RAs - RAlists_C.append([]) - - # assign RAs to unitigs - - for tmp_ra in self.RAlists[0]: - flag = 0 - tmp_ra_seq = tmp_ra["SEQ"] - for i in range( len(unitig_list) ): - unitig = unitig_list[ i ] - if tmp_ra_seq in unitig: - flag = 1 - RAlists_T[ i ].append( tmp_ra ) - break - if flag == 0: - unmapped_RAlist_T.append( tmp_ra ) - #print "unmapped:", tmp_ra["SEQ"] - - for tmp_ra in self.RAlists[1]: - flag = 0 - tmp_ra_seq = tmp_ra["SEQ"] - for i in range( len(unitig_list) ): - unitig = unitig_list[ i ] - if tmp_ra_seq in unitig: - flag = 1 - RAlists_C[ i ].append( tmp_ra ) - break - if flag == 0: - unmapped_RAlist_C.append( tmp_ra ) - #print "unmapped:", tmp_ra["SEQ"] - - #if unmapped_RAlist_T: - #unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C ) - return [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] - - cdef list add_to_unitig_list ( self, unitig_list, unitigs_2nd ): - """ - """ - cdef: - int i,j - int flag - bytes u0, u1 - list new_unitig_list - - new_unitig_list = [] - - for i in range( len(unitigs_2nd) ): - flag = 0 # initial value: can't be found in unitig_list - u0 = unitigs_2nd[ i ] - for j in range( len( unitig_list ) ): - u1 = unitig_list[ j ] - if u1.find( u0 ) != -1: - flag = 1 - break - u1 = u1[::-1].translate(__DNACOMPLEMENT__) - if u1.find( u0 ) != -1: - flag = 1 - break - if not flag: - new_unitig_list.append( u0 ) - new_unitig_list.extend( unitig_list ) - return new_unitig_list - - - cpdef object build_unitig_collection ( self, fermiMinOverlap ): - """unitig_list and tuple_alns are in the same order! - - return UnitigCollection object. 
- - """ - cdef: - long start, end - list unitigs_2nd - bytes u - list target_alns, reference_alns, aln_scores, markup_alns - list target_alns_2nd, reference_alns_2nd, aln_scores_2nd - list RAlists_T = [] # lists of list of RAs of ChIP mapped to each unitig - list RAlists_C = [] - list unmapped_RAlist_T = [] - list unmapped_RAlist_C = [] - bytes tmp_unitig_seq, tmp_reference_seq - bytes tmp_unitig_aln, tmp_reference_aln, - int i, j - long left_padding_ref, right_padding_ref - long left_padding_unitig, right_padding_unitig - list ura_list = [] - RACollection unmapped_ra_collection - int flag = 0 - int n_unmapped, n_unitigs_0, n_unitigs_1 - - # first round of assembly - # print (" First round to assemble unitigs") - unitig_list = self.fermi_assemble( fermiMinOverlap, opt_flag = 0x80 ) - if len(unitig_list) == 0: - return 0 - - n_unitigs_0 = -1 - n_unitigs_1 = len( unitig_list ) - #print " # of Unitigs:", n_unitigs_1 - #print " Map reads to unitigs" - ( unitig_alns, reference_alns, aln_scores, markup_alns) = self.align_unitig_to_REFSEQ( unitig_list ) - - self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns ) - if len(unitig_list) == 0: - # if stop here, it raises a flag that the region may contain too many mismapped reads, we return -1 - return -1 - # print (" # of Unitigs:", n_unitigs_1) - - # assign RAs to unitigs - [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list ) - #print unmapped_ra_collection.get_FASTQ().decode() - - n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C ) - - while len( unmapped_RAlist_T ) > 0 and n_unitigs_1 != n_unitigs_0: - # if there are unmapped reads AND we can get more unitigs - # from last round of assembly, do assembly again - - # print (" # of RAs not mapped, will be assembled again:", n_unmapped) - n_unitigs_0 = n_unitigs_1 - # another round of assembly - unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C ) - unitigs_2nd = unmapped_ra_collection.fermi_assemble( fermiMinOverlap, opt_flag = 0x80 ) - - if unitigs_2nd: - unitig_list = self.add_to_unitig_list ( unitig_list, unitigs_2nd ) - n_unitigs_1 = len( unitig_list ) - #print " # of Unitigs:", n_unitigs_1 - #print " Map reads to unitigs" - ( unitig_alns, reference_alns, aln_scores, markup_alns ) = self.align_unitig_to_REFSEQ( unitig_list ) - self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns ) - [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = self.remap_RAs_w_unitigs( unitig_list ) - n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C ) - #else: - # for r in unmapped_RAlist_T: - # print r.get_FASTQ().decode().lstrip() - - # print (" # of RAs not mapped, will be assembled again with 1/2 of fermiMinOverlap:", n_unmapped) - # another round of assembly - unmapped_ra_collection = RACollection( self.chrom, self.peak, unmapped_RAlist_T, unmapped_RAlist_C ) - unitigs_2nd = unmapped_ra_collection.fermi_assemble( fermiMinOverlap/2, opt_flag = 0x80 ) - - if unitigs_2nd: - unitig_list = self.add_to_unitig_list ( unitig_list, unitigs_2nd ) - n_unitigs_1 = len( unitig_list ) - #print " # of Unitigs:", n_unitigs_1 - #print " Map reads to unitigs" - ( unitig_alns, reference_alns, aln_scores, markup_alns ) = self.align_unitig_to_REFSEQ( unitig_list ) - self.verify_alns( unitig_list, unitig_alns, reference_alns, aln_scores, markup_alns ) - [ RAlists_T, RAlists_C, unmapped_RAlist_T, unmapped_RAlist_C ] = 
self.remap_RAs_w_unitigs( unitig_list ) - n_unmapped = len( unmapped_RAlist_T ) + len( unmapped_RAlist_C ) - #else: - # for r in unmapped_RAlist_T: - # print r.get_FASTQ().decode().lstrip() - if len(unitig_list) == 0: - raise Exception("Shouldn't reach here") - # print (" # of Unitigs:", n_unitigs_1) - - if len(unitig_list) == 0: - return None - #print (" Final round: # of Unitigs:", len(unitig_list)) - #print (" Final round: # of RAs not mapped:", n_unmapped) - - start = min( self.left, self.RAs_left ) - end = max( self.right, self.RAs_right ) - - # create UnitigCollection - for i in range( len( unitig_list ) ): - #b'---------------------------AAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAA--------------' - #b'GATATAAATAGGATGTTATGAGTTTTCAAATAATTTTATGTCCTTCAGTACAAAAAGCAGTTTCAACTAAAACCCAGTAACAAGCTAGCAATTCCTTTTAAATGGTGCTACTTCAAGCTGCAGCCAGGTAGCTTTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCTGCAGAAACTTCTTAAATCGTCTGTGTTTGGGACTCGTGGGGCCCCACAGGGCTTTACAAGGGCTTTTTAATTTCCAAAAACATAAAACAAAAAAAAATACAAATGTATT' - tmp_unitig_aln = unitig_alns[ i ] - tmp_reference_aln = reference_alns[ i ] - tmp_unitig_seq = tmp_unitig_aln.replace(b'-',b'') - tmp_reference_seq = tmp_reference_aln.replace(b'-',b'') - - # print tmp_unitig_aln - # print tmp_reference_aln - # print tmp_unitig_seq - # print tmp_reference_aln - - # find the position on self.peak_refseq_ext - left_padding_ref = self.peak_refseq_ext.find( tmp_reference_seq ) # this number of nts should be skipped on refseq_ext from left - right_padding_ref = len(self.peak_refseq_ext) - left_padding_ref - len(tmp_reference_seq) # this number of nts should be skipped on refseq_ext from right - - #now, decide the lpos and rpos on reference of this unitig - #first, trim left padding '-' - left_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.lstrip(b'-')) - right_padding_unitig = len(tmp_unitig_aln) - len(tmp_unitig_aln.rstrip(b'-')) - - tmp_lpos = start + left_padding_ref - tmp_rpos = end - right_padding_ref - - for j in range( left_padding_unitig ): - if tmp_reference_aln[ j ] != b'-': - tmp_lpos += 1 - for j in range( 1, right_padding_unitig + 1 ): - if tmp_reference_aln[ -j ] != b'-': - tmp_rpos -= 1 - - tmp_unitig_aln = tmp_unitig_aln[ left_padding_unitig:(len(tmp_unitig_aln)-right_padding_unitig)] - tmp_reference_aln = tmp_reference_aln[ left_padding_unitig:(len(tmp_reference_aln)-right_padding_unitig)] - - ura_list.append( UnitigRAs( self.chrom, tmp_lpos, tmp_rpos, tmp_unitig_aln, tmp_reference_aln, [RAlists_T[i], RAlists_C[i]] ) ) - - return UnitigCollection( self.chrom, self.peak, ura_list ) diff --git a/MACS3/Signal/ReadAlignment.pyx b/MACS3/Signal/ReadAlignment.py similarity index 54% rename from MACS3/Signal/ReadAlignment.pyx rename to MACS3/Signal/ReadAlignment.py index d5376350..b174ba9e 100644 --- a/MACS3/Signal/ReadAlignment.pyx +++ b/MACS3/Signal/ReadAlignment.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2021-03-10 16:21:51 Tao Liu> +# Time-stamp: <2024-10-22 15:19:55 Tao Liu> """Module for SAPPER ReadAlignment class @@ -12,18 +12,19 @@ # ------------------------------------ # python modules # ------------------------------------ -from cpython cimport bool - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - 
size_t strlen(char *s) - void *malloc(size_t size) - void *calloc(size_t n, size_t size) - void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) +import cython +from cython.cimports.cpython import bool + +# cdef extern from "stdlib.h": +# ctypedef unsigned int size_t +# size_t strlen(char *s) +# void *malloc(size_t size) +# void *calloc(size_t n, size_t size) +# void free(void *ptr) +# int strcmp(char *a, char *b) +# char * strcpy(char *a, char *b) +# long atol(char *bytes) +# int atoi(char *bytes) # ------------------------------------ # constants @@ -53,31 +54,35 @@ # ------------------------------------ # Classes # ------------------------------------ -cdef class ReadAlignment: - cdef: - bytes readname - bytes chrom - int lpos - int rpos - int strand # strand information. 0 means forward strand, 1 means reverse strand. - bytes binaryseq - bytes binaryqual - int l # length of read - tuple cigar # each item contains op_l|op - bytes MD - int n_edits # number of edits; higher the number, - # more differences with reference - bytes SEQ # sequence of read regarding to + strand - bytes QUAL # quality of read regarding to + strand - - def __init__ ( self, - bytes readname, - bytes chrom, int lpos, int rpos, - int strand, - bytes binaryseq, - bytes binaryqual, - tuple cigar, - bytes MD ): + + +@cython.cclass +class ReadAlignment: + readname: bytes + chrom: bytes + lpos: cython.int + rpos: cython.int + # strand information. 0 means forward strand, 1 means reverse strand. + strand: cython.int + binaryseq: bytes + binaryqual: bytes + l: cython.int # length of read + cigar: tuple # each item contains op_l|op + MD: bytes + # number of edits; higher the number, more differences with reference + n_edits: cython.int + SEQ: bytes # sequence of read regarding to + strand + QUAL: bytes # quality of read regarding to + strand + + def __init__(self, + readname: bytes, + chrom: bytes, + lpos: cython.int, rpos: cython.int, + strand: cython.int, + binaryseq: bytes, + binaryqual: bytes, + cigar: tuple, + MD: bytes): self.readname = readname self.chrom = chrom self.lpos = lpos @@ -85,34 +90,36 @@ def __init__ ( self, self.strand = strand self.binaryseq = binaryseq self.binaryqual = binaryqual - self.l = len( binaryqual ) + self.l = len(binaryqual) self.cigar = cigar self.MD = MD self.n_edits = self.get_n_edits() (self.SEQ, self.QUAL) = self.__get_SEQ_QUAL() - cdef int get_n_edits( self ): + @cython.cfunc + def get_n_edits(self) -> cython.int: """The number is from self.cigar and self.MD. 
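The binaryseq field declared above stores two bases per byte, following the BAM 4-bit packing that __get_SEQ_QUAL below unpacks through the "=ACMGRSVTWYHKDBN" table (first base in the high nibble, second in the low nibble). A minimal standalone sketch of that decoding rule; decode_binaryseq is a hypothetical name, not part of the class:

    BAMDNACODE = b"=ACMGRSVTWYHKDBN"   # same 16-letter table as __BAMDNACODE__

    def decode_binaryseq(binaryseq: bytes, length: int) -> bytes:
        out = bytearray()
        for byte in binaryseq:
            out.append(BAMDNACODE[(byte >> 4) & 15])  # 1st base: highest 4 bits
            out.append(BAMDNACODE[byte & 15])         # 2nd base: lowest 4 bits
        return bytes(out[:length])                    # drop pad nibble if length is odd

    # 0x12 packs A (1) and C (2); 0x40 packs G (4) plus a pad nibble
    assert decode_binaryseq(b"\x12\x40", 3) == b"ACG"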
""" - cdef: - int n_edits - int i, cigar_op, cigar_op_l - char c - + n_edits: cython.int + i: cython.int + cigar_op: cython.int + cigar_op_l: cython.int + c: cython.char + n_edits = 0 for i in self.cigar: # only count insertion or softclip cigar_op = i & 15 cigar_op_l = i >> 4 - if cigar_op in [ 1, 4 ]: # count Insertion or Softclip + if cigar_op in [1, 4]: # count Insertion or Softclip n_edits += cigar_op_l - + for c in self.MD: - if (c > 64 and c < 91): # either deletion in query or mismatch + if (c > 64 and c < 91): # either deletion in query or mismatch n_edits += 1 return n_edits - def __str__ ( self ): + def __str__(self): c = self.chrom.decode() n = self.readname.decode() if self.strand: @@ -121,7 +128,7 @@ def __str__ ( self ): s = "+" return f"{c}\t{self.lpos}\t{self.rpos}\t{n}\t{self.l}\t{s}" - def __getitem__ ( self, keyname ): + def __getitem__(self, keyname): if keyname == "readname": return self.readname elif keyname == "chrom": @@ -151,142 +158,128 @@ def __getitem__ ( self, keyname ): else: raise KeyError("No such key", keyname) - def __getstate__ ( self ): - return ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL ) - - def __setstate__ ( self, state ): - ( self.readname, self.chrom, self.lpos, self.rpos, self.strand, self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, self.n_edits, self.SEQ, self.QUAL ) = state - - # cpdef bytearray get_SEQ ( self ): - # """Convert binary seq to ascii seq. - - # Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15] + def __getstate__(self): + return (self.readname, self.chrom, self.lpos, self.rpos, self.strand, + self.binaryseq, self.binaryqual, self.l, self.cigar, + self.MD, self.n_edits, self.SEQ, self.QUAL) - # Note: In BAM, if a sequence is mapped to reverse strand, the - # reverse complement seq is written in SEQ field. So the return - # value of this function will not be the original one if the - # read is mapped to - strand. - # """ - # cdef: - # char c - # bytearray seq + def __setstate__(self, state): + (self.readname, self.chrom, self.lpos, self.rpos, self.strand, + self.binaryseq, self.binaryqual, self.l, self.cigar, self.MD, + self.n_edits, self.SEQ, self.QUAL) = state - # seq = bytearray(b"") - # for c in self.binaryseq: - # # high - # seq.append( __BAMDNACODE__[c >> 4 & 15] ) - # # low - # seq.append( __BAMDNACODE__[c & 15] ) - # if seq[-1] == b"=": - # # trim the last '=' if it exists - # seq = seq[:-1] - # return seq + @cython.cfunc + def __get_SEQ_QUAL(self) -> tuple: + """Get of: tuple (SEQ, QUAL). - cdef tuple __get_SEQ_QUAL ( self ): - """Get tuple of (SEQ, QUAL). - - Rule: for each byte, 1st base in the highest 4bit; 2nd in the lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15] + Rule: for each byte, 1st base in the highest 4bit; 2nd in the + lowest 4bit. "=ACMGRSVTWYHKDBN" -> [0,15] Note: In BAM, if a sequence is mapped to reverse strand, the reverse complement seq is written in SEQ field. So the return value of this function will not be the original one if the read is mapped to - strand. If you need to original one, do reversecomp for SEQ and reverse QUAL. 
+ """ - cdef: - int i - char c - bytearray seq - bytearray qual + i: cython.int + c: cython.char + seq: bytearray + qual: bytearray seq = bytearray(b"") qual = bytearray(b"") - for i in range( len(self.binaryseq) ): - c = self.binaryseq[ i ] + for i in range(len(self.binaryseq)): + c = self.binaryseq[i] # high - seq.append( __BAMDNACODE__[c >> 4 & 15] ) + seq.append(__BAMDNACODE__[c >> 4 & 15]) # low - seq.append( __BAMDNACODE__[c & 15] ) + seq.append(__BAMDNACODE__[c & 15]) + + for i in range(len(self.binaryqual)): + # qual is the -10log10 p or phred score. + qual.append(self.binaryqual[i]) - for i in range( len( self.binaryqual ) ): - # qual is the -10log10 p or phred score. - qual.append( self.binaryqual[i] ) - if seq[-1] == b"=": # trim the last '=' if it exists seq = seq[:-1] - assert len( seq ) == len( qual ), Exception("Lengths of seq and qual are not consistent!") + assert len(seq) == len(qual), Exception("Lengths of seq and qual are not consistent!") # Example on how to get original SEQ and QUAL: - #if self.strand: + # if self.strand: # seq.reverse() # #compliment - # seq = seq.translate( __DNACOMPLEMENT__ ) + # seq = seq.translate(__DNACOMPLEMENT__) # qual.reverse() - return ( bytes(seq), bytes(qual) ) + return (bytes(seq), bytes(qual)) - - cpdef bytes get_FASTQ ( self ): + @cython.ccall + def get_FASTQ(self) -> bytes: """Get FASTQ format text. """ - cdef: - bytes seq - bytearray qual + seq: bytes + qual: bytearray seq = self.SEQ qual = bytearray(self.QUAL) - for i in range( len( self.QUAL ) ): - # qual is the -10log10 p or phred score, to make FASTQ, we have to add 33 - qual[ i ] += 33 - + for i in range(len(self.QUAL)): + # qual is the -10log10 p or phred score, to make FASTQ, we + # have to add 33 + qual[i] += 33 + # reverse while necessary if self.strand: seq = self.SEQ[::-1] - #compliment - seq = seq.translate( __DNACOMPLEMENT__ ) + # compliment + seq = seq.translate(__DNACOMPLEMENT__) qual = qual[::-1] else: seq = self.SEQ return b"@" + self.readname + b"\n" + seq + b"\n+\n" + qual + b"\n" - cpdef bytearray get_REFSEQ ( self ): + @cython.ccall + def get_REFSEQ(self) -> bytearray: """Fetch reference sequence, using self.MD and self.cigar """ - cdef: - char c - bytearray seq, refseq - int i, cigar_op, cigar_op_l - bytearray MD_op - int ind - bool flag_del # flag for deletion event in query - - seq = bytearray(self.SEQ) # we start with read seq then make modifications + c: cython.char + seq: bytearray + i: cython.int + cigar_op: cython.int + cigar_op_l: cython.int + MD_op: bytearray + ind: cython.int + flag_del: bool # flag for deletion event in query + + # we start with read seq then make modifications + seq = bytearray(self.SEQ) # 2-step proces - # First step: use CIGAR to edit SEQ to remove S (softclip) and I (insert) + # First step: use CIGAR to edit SEQ to remove S (softclip) and + # I (insert) # __CIGARCODE__ = "MIDNSHP=X" # let ind be the index in SEQ ind = 0 for i in self.cigar: cigar_op = i & 15 cigar_op_l = i >> 4 - if cigar_op in [2, 5, 6]: # do nothing for Deletion (we will - # put sequence back in step 2), - # Hardclip and Padding + if cigar_op in [2, 5, 6]: + # do nothing for Deletion (we will + # put sequence back in step 2), + # Hardclip and Padding pass - elif cigar_op in [0, 7, 8]: # M = X alignment match (match or - # mismatch) - # do nothing and move ind + elif cigar_op in [0, 7, 8]: + # M = X alignment match (match or mismatch) do nothing + # and move ind ind += cigar_op_l - elif cigar_op in [ 1, 4 ]: # Remove for Insertion or Softclip - seq[ ind : ind + 
cigar_op_l ] = b'' + elif cigar_op in [1, 4]: # Remove for Insertion or Softclip + seq[ind: ind + cigar_op_l] = b'' - # now the seq should be at the same length as rpos-lpos + # now the seq should be at the same length as rpos-lpos # Second step: use MD string to edit SEQ to put back 'deleted # seqs' and modify mismatches @@ -305,12 +298,12 @@ def __setstate__ ( self, state ): # right, a mismatch should only be 1 letter surrounded # by digits. ind += int(MD_op) - seq[ ind ] = c + seq[ind] = c ind += 1 # reset MD_op MD_op = bytearray(b'') elif (c > 64 and c < 91) and flag_del: - seq[ ind:ind ] = [c,] + seq[ind:ind] = [c,] ind += 1 elif c == 94: # means Deletion in query. Now, insert a sequnce into @@ -321,62 +314,74 @@ def __setstate__ ( self, state ): MD_op = bytearray(b'') else: raise Exception("Don't understand this operator in MD: %c" % c) - #print( seq.decode() ) + # print(seq.decode()) return seq - - cpdef get_base_by_ref_pos ( self, long ref_pos ): + + @cython.ccall + def get_base_by_ref_pos(self, ref_pos: cython.long): """Get base by ref position. """ - cdef: - int relative_pos, p + relative_pos: cython.int + p: cython.int + assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") relative_pos = ref_pos - self.lpos - p = self.relative_ref_pos_to_relative_query_pos( relative_pos ) + p = self.relative_ref_pos_to_relative_query_pos(relative_pos) if p == -1: # located in a region deleted in query return None else: - return __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ] + return __BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15] - cpdef get_bq_by_ref_pos ( self, long ref_pos ): + @cython.ccall + def get_bq_by_ref_pos(self, ref_pos: cython.long): """Get base quality by ref position. Base quality is in Phred scale. Returned value is the raw Phred-scaled base quality. """ - cdef: - int relative_pos, p + relative_pos: cython.int + p: cython.int + assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") relative_pos = ref_pos - self.lpos - p = self.relative_ref_pos_to_relative_query_pos( relative_pos ) + p = self.relative_ref_pos_to_relative_query_pos(relative_pos) if p == -1: # located in a region deleted in query return None else: return self.binaryqual[p] - cpdef tuple get_base_bq_by_ref_pos ( self, long ref_pos ): - """Get base and base quality by ref position. Base quality is in Phred scale. + @cython.ccall + def get_base_bq_by_ref_pos(self, ref_pos: cython.long) -> tuple: + """Get base and base quality by ref position. Base quality is + in Phred scale. Returned bq is the raw Phred-scaled base quality. + """ - cdef: - int relative_pos, p + relative_pos: cython.int + p: cython.int + assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") relative_pos = ref_pos - self.lpos - p = self.relative_ref_pos_to_relative_query_pos( relative_pos ) + p = self.relative_ref_pos_to_relative_query_pos(relative_pos) if p == -1: # located in a region deleted in query return None else: - return ( __BAMDNACODE__[ (self.binaryseq[p//2] >> ((1-p%2)*4) ) & 15 ], self.binaryqual[p] ) + return (__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15], + self.binaryqual[p]) - cpdef tuple get_variant_bq_by_ref_pos ( self, long ref_pos ): - """Get any variants (different with reference) and base quality by ref position. 
+ @cython.ccall + def get_variant_bq_by_ref_pos(self, + ref_pos: cython.long) -> tuple: + """Get any variants (different with reference) and base + quality by ref position. - variants will be + variants will be 1) =, if identical @@ -390,40 +395,43 @@ def __setstate__ ( self, state ): Base quality is the raw Phred-scaled base quality. """ - cdef: - int i, m, n - int res, p, op, op_l - int pos - bool tip - bytearray refseq - bytes p_refseq, p_seq - bytearray seq_array - bytearray bq_array + i: cython.int + m: cython.int + n: cython.int + res: cython.int + p: cython.int + op: cython.int + op_l: cython.int + pos: cython.int + tip: bool + refseq: bytearray + p_refseq: bytes + seq_array: bytearray + bq_array: bytearray assert self.lpos <= ref_pos and self.rpos > ref_pos, Exception("Given position out of alignment location") res = ref_pos - self.lpos # residue p = 0 - refseq = self.get_REFSEQ() - p_refseq = refseq[ res ] + # p_refseq = refseq[res] # -- CIGAR CODE -- - #OP BAM Description - #M 0 alignment match (can be a sequence match or mismatch) insertion to the reference - #I 1 insertion to the reference - #D 2 deletion from the reference - #N 3 skipped region from the reference - #S 4 soft clipping (clipped sequences present in SEQ) - #H 5 hard clipping (clipped sequences NOT present in SEQ) - #P 6 padding (silent deletion from padded reference) - #= 7 sequence match - #X 8 sequence mismatch - - seq_array = bytearray( b'' ) - bq_array = bytearray( b'' ) - - for m in range( len(self.cigar) ): - i = self.cigar[ m ] + # OP BAM Description + # M 0 alignment match (can be a sequence match or mismatch) insertion to the reference + # I 1 insertion to the reference + # D 2 deletion from the reference + # N 3 skipped region from the reference + # S 4 soft clipping (clipped sequences present in SEQ) + # H 5 hard clipping (clipped sequences NOT present in SEQ) + # P 6 padding (silent deletion from padded reference) + # = 7 sequence match + # X 8 sequence mismatch + + seq_array = bytearray(b'') + bq_array = bytearray(b'') + + for m in range(len(self.cigar)): + i = self.cigar[m] op = i & 15 op_l = i >> 4 if op in [0, 7, 8]: # M = X alignment match (match or mismatch) @@ -432,98 +440,101 @@ def __setstate__ ( self, state ): p += res # find the position, now get the ref pos = p - seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - bq_array.append( self.binaryqual[ p ] ) + seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15]) + bq_array.append(self.binaryqual[p]) break elif res == op_l - 1: p += res pos = p - seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - bq_array.append( self.binaryqual[ p ] ) + seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15]) + bq_array.append(self.binaryqual[p]) # now add any insertion later on # get next cigar - if m + 1 == len( self.cigar ): + if m + 1 == len(self.cigar): break - i = self.cigar[ m + 1 ] + i = self.cigar[m + 1] op = i & 15 op_l = i >> 4 - if op == 1: #insertion - for n in range( op_l ): + if op == 1: # insertion + for n in range(op_l): p += 1 - seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - bq_array.append( self.binaryqual[ p ] ) - #print self.SEQ, seq_array + seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p % 2)*4)) & 15]) + bq_array.append(self.binaryqual[p]) + # prself: cython.int.SEQ, seq_array break else: # go to the next cigar code p += op_l res -= op_l - elif op in [ 2, 3 ]: # D 
N + elif op in [2, 3]: # D N if res < op_l: # find the position, however ... - # position located in a region in reference that not exists in query + # position located in a region in reference that + # not exists in query pos = p - seq_array.append( b'*' ) - bq_array.append( 93 ) #assign 93 for deletion + seq_array.append(b'*') + bq_array.append(93) # assign 93 for deletion break else: # go to the next cigar code res -= op_l - elif op == 1 : # Insertion + elif op == 1: # Insertion p += op_l # if res == 0: # no residue left, so return a chunk of inserted sequence # print "shouldn't run this code" # # first, add the insertion point - # seq_array = bytearray( b'~' ) - # bq_array.append( self.binaryqual[ p ] ) + # seq_array = bytearray(b'~') + # bq_array.append(self.binaryqual[p]) # # then add the inserted seq - # for i in range( op_l ): + # for i in range(op_l): # p += 1 - # seq_array.append( __BAMDNACODE__[ (self.binaryseq[ p//2 ] >> ( (1-p%2)*4 ) ) & 15 ] ) - # bq_array.append( self.binaryqual[ p ] ) + # seq_array.append(__BAMDNACODE__[(self.binaryseq[p//2] >> ((1-p%2)*4)) & 15] ) + # bq_array.append(self.binaryqual[p]) # break # else: # p += op_l - elif op == 4 : # Softclip. If it's Softclip, we'd better not return the extra seq + elif op == 4: # Softclip. If it's Softclip, we'd better not return the extra seq p += op_l if pos == 0 or pos == self.l - 1: tip = True else: tip = False - - return ( seq_array, bq_array, self.strand, tip, pos ) + return (seq_array, bq_array, self.strand, tip, pos) # last position ? - #raise Exception("Not expected to see this") + # raise Exception("Not expected to see this") - cdef int relative_ref_pos_to_relative_query_pos ( self, long relative_ref_pos ): + @cython.cfunc + def relative_ref_pos_to_relative_query_pos(self, + relative_ref_pos: cython.long) -> cython.int: """Convert relative pos on ref to pos on query. """ - cdef: - int p, res, op, op_l + p: cython.int + res: cython.int + op: cython.int + op_l: cython.int + p = 0 res = relative_ref_pos - + for i in self.cigar: op = i & 15 op_l = i >> 4 - if op in [0, 7, 8]: # M = X alignment match (match or mismatch) + if op in [0, 7, 8]: + # M = X alignment match (match or mismatch) if res < op_l: p += res return p else: p += op_l res -= op_l - elif op in [ 2, 3 ]: # D N + elif op in [2, 3]: # D N if res < op_l: - # position located in a region in reference that not exists in query + # position located in a region in reference that + # not exists in query return -1 else: res -= op_l - elif op in [ 1, 4 ]: # I + elif op in [1, 4]: # I p += op_l return p - - -### End ### - diff --git a/MACS3/Signal/ScoreTrack.py b/MACS3/Signal/ScoreTrack.py new file mode 100644 index 00000000..d67349a4 --- /dev/null +++ b/MACS3/Signal/ScoreTrack.py @@ -0,0 +1,1852 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-18 15:22:06 Tao Liu> + +"""Module for Feature IO classes. + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file LICENSE included with +the distribution). 
+""" + +# ------------------------------------ +# python modules +# ------------------------------------ +from functools import reduce + +# ------------------------------------ +# MACS3 modules +# ------------------------------------ +from MACS3.Signal.SignalProcessing import maxima, enforce_peakyness +from MACS3.Signal.Prob import poisson_cdf +from MACS3.IO.PeakIO import PeakIO, BroadPeakIO + +# ------------------------------------ +# Other modules +# ------------------------------------ +import cython +import numpy as np +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool +from cykhash import PyObjectMap, Float32to32Map + +# ------------------------------------ +# C lib +# ------------------------------------ +from cython.cimports.libc.math import (log10, + log) + +# ------------------------------------ +# constants +# ------------------------------------ + +# ------------------------------------ +# Misc functions +# ------------------------------------ + + +@cython.inline +@cython.cfunc +def int_max(a: cython.int, b: cython.int) -> cython.int: + return a if a >= b else b + + +@cython.inline +@cython.cfunc +def int_min(a: cython.int, b: cython.int) -> cython.int: + return a if a <= b else b + + +LOG10_E: cython.float = 0.43429448190325176 + +pscore_dict = PyObjectMap() + + +@cython.cfunc +def get_pscore(observed: cython.int, + expectation: cython.float) -> cython.float: + """Get p-value score from Poisson test. First check existing + table, if failed, call poisson_cdf function, then store the result + in table. + + """ + score: cython.double + + try: + return pscore_dict[(observed, expectation)] + except KeyError: + score = -1 * poisson_cdf(observed, + expectation, + False, + True) + pscore_dict[(observed, expectation)] = score + return score + + +asym_logLR_dict = PyObjectMap() + + +@cython.cfunc +def logLR_asym(x: cython.float, + y: cython.float) -> cython.float: + """Calculate log10 Likelihood between H1 (enriched) and H0 ( + chromatin bias). Set minus sign for depletion. + + *asymmetric version* + + """ + s: cython.float + + if (x, y) in asym_logLR_dict: + return asym_logLR_dict[(x, y)] + else: + if x > y: + s = (x*(log(x)-log(y))+y-x)*LOG10_E + elif x < y: + s = (x*(-log(x)+log(y))-y+x)*LOG10_E + else: + s = 0 + asym_logLR_dict[(x, y)] = s + return s + + +sym_logLR_dict = PyObjectMap() + + +@cython.cfunc +def logLR_sym(x: cython.float, y: cython.float) -> cython.float: + """Calculate log10 Likelihood between H1 (enriched) and H0 ( + another enriched). Set minus sign for H0>H1. + + * symmetric version * + + """ + s: cython.float + + if (x, y) in sym_logLR_dict: + return sym_logLR_dict[(x, y)] + else: + if x > y: + s = (x*(log(x)-log(y))+y-x)*LOG10_E + elif y > x: + s = (y*(log(x)-log(y))+y-x)*LOG10_E + else: + s = 0 + sym_logLR_dict[(x, y)] = s + return s + + +@cython.inline +@cython.cfunc +def get_logFE(x: cython.float, y: cython.float) -> cython.float: + """ return 100* log10 fold enrichment with +1 pseudocount. + """ + return log10(x/y) + + +@cython.cfunc +def get_subtraction(x: cython.float, y: cython.float) -> cython.float: + """ return subtraction. + """ + return x - y + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class ScoreTrackII: + """Class for a container to keep signals of each genomic position, + including 1. score, 2. treatment and 2. control pileup. + + It also contains scoring methods and call_peak functions. 
+ """ + # dictionary for data of each chromosome + data: dict + # length of data array of each chromosome + datalength: dict + # whether trackline should be saved in bedGraph + trackline: bool + # seq depth in million of treatment + treat_edm: cython.float + # seq depth in million of control + ctrl_edm: cython.float + # method for calculating scores. + scoring_method: cython.char + # scale to control? scale to treatment? both scale to 1million reads? + normalization_method: cython.char + # the pseudocount used to calcuate logLR, FE or logFE + pseudocount: cython.float + # cutoff + cutoff: cython.float + # save pvalue<->length dictionary + pvalue_stat: dict + + def __init__(self, + treat_depth: cython.float, + ctrl_depth: cython.float, + pseudocount: cython.float = 1.0): + """Initialize. + + treat_depth and ctrl_depth are effective depth in million: + sequencing depth in million after + duplicates being filtered. If + treatment is scaled down to + control sample size, then this + should be control sample size in + million. And vice versa. + + pseudocount: a pseudocount used to calculate logLR, FE or + logFE. Please note this value will not be changed + with normalization method. So if you really want + to set pseudocount 1 per million reads, set it + after you normalize treat and control by million + reads by `change_normalizetion_method(ord('M'))`. + + """ + # for each chromosome, there is a l*4 matrix. First column: + # end position of a region; Second: treatment pileup; third: + # control pileup ; forth: score (can be p/q-value/likelihood + # ratio/fold-enrichment/subtraction depending on -c setting) + self.data = {} + + self.datalength = {} + self.trackline = False + self.treat_edm = treat_depth + self.ctrl_edm = ctrl_depth + + # scoring_method: p: -log10 pvalue; + # q: -log10 qvalue; + # l: log10 likelihood ratio (minus for depletion) + # f: log10 fold enrichment + # F: linear fold enrichment + # d: subtraction + # m: fragment pileup per million reads + # N: not set + self.scoring_method = ord("N") + + # normalization_method: T: scale to depth of treatment; + # C: scale to depth of control; + # M: scale to depth of 1 million; + # N: not set/ raw pileup + self.normalization_method = ord("N") + + self.pseudocount = pseudocount + self.pvalue_stat = {} + + @cython.ccall + def set_pseudocount(self, pseudocount: cython.float): + self.pseudocount = pseudocount + + @cython.ccall + def enable_trackline(self): + """Turn on trackline with bedgraph output + """ + self.trackline = True + + @cython.ccall + def add_chromosome(self, + chrom: bytes, + chrom_max_len: cython.int): + """ + chrom: chromosome name + chrom_max_len: maximum number of data points in this chromosome + + """ + if chrom not in self.data: + self.data[chrom] = [np.zeros(chrom_max_len, dtype="int32"), # pos + # pileup at each interval, in float32 format + np.zeros(chrom_max_len, dtype="float32"), + # control at each interval, in float32 format + np.zeros(chrom_max_len, dtype="float32"), + # score at each interval, in float32 format + np.zeros(chrom_max_len, dtype="float32")] + self.datalength[chrom] = 0 + + @cython.ccall + def add(self, + chromosome: bytes, + endpos: cython.int, + chip: cython.float, + control: cython.float): + """Add a chr-endpos-sample-control block into data + dictionary. 
+ + chromosome: chromosome name in string + endpos : end position of each interval in integer + chip : ChIP pileup value of each interval in float + control : Control pileup value of each interval in float + + *Warning* Need to add regions continuously. + """ + i: cython.int + + i = self.datalength[chromosome] + c = self.data[chromosome] + c[0][i] = endpos + c[1][i] = chip + c[2][i] = control + self.datalength[chromosome] += 1 + + @cython.ccall + def finalize(self): + """ + Adjust array size of each chromosome. + + """ + chrom: bytes + ln: cython.int + + for chrom in sorted(self.data.keys()): + d = self.data[chrom] + ln = self.datalength[chrom] + d[0].resize(ln, refcheck=False) + d[1].resize(ln, refcheck=False) + d[2].resize(ln, refcheck=False) + d[3].resize(ln, refcheck=False) + return + + @cython.ccall + def get_data_by_chr(self, + chromosome: bytes): + """Return array of counts by chromosome. + + The return value is a tuple: + ([end pos],[value]) + """ + if chromosome in self.data: + return self.data[chromosome] + else: + return None + + @cython.ccall + def get_chr_names(self): + """Return all the chromosome names stored. + + """ + return set(self.data.keys()) + + @cython.ccall + def change_normalization_method(self, + normalization_method: cython.char): + """Change/set normalization method. However, I do not + recommend change this back and forward, since some precision + issue will happen -- I only keep two digits. + + normalization_method: T: scale to depth of treatment; + C: scale to depth of control; + M: scale to depth of 1 million; + N: not set/ raw pileup + """ + if normalization_method == ord('T'): + if self.normalization_method == ord('T'): # do nothing + pass + elif self.normalization_method == ord('C'): + self.normalize(self.treat_edm/self.ctrl_edm, + self.treat_edm/self.ctrl_edm) + elif self.normalization_method == ord('M'): + self.normalize(self.treat_edm, self.treat_edm) + elif self.normalization_method == ord('N'): + self.normalize(1, self.treat_edm/self.ctrl_edm) + else: + raise NotImplementedError + self.normalization_method = ord('T') + elif normalization_method == ord('C'): + if self.normalization_method == ord('T'): + self.normalize(self.ctrl_edm/self.treat_edm, + self.ctrl_edm/self.treat_edm) + elif self.normalization_method == ord('C'): # do nothing + pass + elif self.normalization_method == ord('M'): + self.normalize(self.ctrl_edm, self.ctrl_edm) + elif self.normalization_method == ord('N'): + self.normalize(self.ctrl_edm/self.treat_edm, 1) + else: + raise NotImplementedError + self.normalization_method = ord('C') + elif normalization_method == ord('M'): + if self.normalization_method == ord('T'): + self.normalize(1/self.treat_edm, + 1/self.treat_edm) + elif self.normalization_method == ord('C'): + self.normalize(1/self.ctrl_edm, + 1/self.ctrl_edm) + elif self.normalization_method == ord('M'): # do nothing + pass + elif self.normalization_method == ord('N'): + self.normalize(1/self.treat_edm, + 1/self.ctrl_edm) + else: + raise NotImplementedError + self.normalization_method = ord('M') + elif normalization_method == ord('N'): + if self.normalization_method == ord('T'): + self.normalize(self.treat_edm, + self.treat_edm) + elif self.normalization_method == ord('C'): + self.normalize(self.ctrl_edm, + self.ctrl_edm) + elif self.normalization_method == ord('M'): + self.normalize(self.treat_edm, + self.ctrl_edm) + elif self.normalization_method == ord('N'): # do nothing + pass + else: + raise NotImplementedError + self.normalization_method = ord('N') + + @cython.cfunc + 
def normalize(self, + treat_scale: cython.float, + control_scale: cython.float): + p: cnp.ndarray + c: cnp.ndarray + ln: cython.long + i: cython.long + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1] + c = self.data[chrom][2] + ln = self.datalength[chrom] + for i in range(ln): + p[i] *= treat_scale + c[i] *= control_scale + return + + @cython.ccall + def change_score_method(self, + scoring_method: cython.char): + """ + scoring_method: p: -log10 pvalue; + q: -log10 qvalue; + l: log10 likelihood ratio (minus for depletion) + s: symmetric log10 likelihood ratio (for comparing two + ChIPs) + f: log10 fold enrichment + F: linear fold enrichment + d: subtraction + M: maximum + m: fragment pileup per million reads + """ + if scoring_method == ord('p'): + self.compute_pvalue() + elif scoring_method == ord('q'): + # if not already calculated p, compute pvalue first + if self.scoring_method != ord('p'): + self.compute_pvalue() + self.compute_qvalue() + elif scoring_method == ord('l'): + self.compute_likelihood() + elif scoring_method == ord('s'): + self.compute_sym_likelihood() + elif scoring_method == ord('f'): + self.compute_logFE() + elif scoring_method == ord('F'): + self.compute_foldenrichment() + elif scoring_method == ord('d'): + self.compute_subtraction() + elif scoring_method == ord('m'): + self.compute_SPMR() + elif scoring_method == ord('M'): + self.compute_max() + else: + raise NotImplementedError + + @cython.cfunc + def compute_pvalue(self): + """Compute -log_{10}(pvalue) + """ + p: cnp.ndarray + c: cnp.ndarray + v: cnp.ndarray + pos: cnp.ndarray + ln: cython.long + i: cython.long + prev_pos: cython.long + chrom: bytes + + for chrom in sorted(self.data.keys()): + prev_pos = 0 + pos = self.data[chrom][0] + p = self.data[chrom][1] + c = self.data[chrom][2] + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = get_pscore(cython.cast(cython.int, + (p[i] + self.pseudocount)), + c[i] + self.pseudocount) + try: + self.pvalue_stat[v[i]] += pos[i] - prev_pos + except Exception: + self.pvalue_stat[v[i]] = pos[i] - prev_pos + prev_pos = pos[i] + + self.scoring_method = ord('p') + return + + @cython.cfunc + def compute_qvalue(self): + """Compute -log_{10}(qvalue) + """ + pqtable: object + i: cython.long + ln: cython.long + chrom: bytes + v: cnp.ndarray + + # pvalue should be computed first! + assert self.scoring_method == ord('p') + # make pqtable + pqtable = self.make_pq_table() + + # convert p to q + for chrom in sorted(self.data.keys()): + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = pqtable[v[i]] + + self.scoring_method = ord('q') + return + + @cython.ccall + def make_pq_table(self): + """Make pvalue-qvalue table. + + Step1: get all pvalue and length of block with this pvalue + Step2: Sort them + Step3: Apply AFDR method to adjust pvalue and get qvalue for + each pvalue + + Return a dictionary of + {-log10pvalue:(-log10qvalue,rank,basepairs)} relationships. 
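A plain-Python sketch of the three steps listed above, as one standalone function. It assumes clamping at zero in place of the real method's early break, and make_pq_table here is a hypothetical rewrite for illustration, not the class method itself:

    from math import log10

    def make_pq_table(pvalue_stat):
        """pvalue_stat maps -log10(p) -> total basepairs with that score."""
        N = sum(pvalue_stat.values())
        k = 1                                       # rank, weighted by basepairs
        pre_q = float("inf")
        table = {}
        for v in sorted(pvalue_stat, reverse=True):  # strongest scores first
            q = v + (log10(k) - log10(N))            # -log10(p * N / rank)
            q = min(q, pre_q)                        # enforce monotonicity
            q = max(q, 0)                            # q-scores never go negative
            table[v] = q
            pre_q = q
            k += pvalue_stat[v]
        return table

    table = make_pq_table({10.0: 500, 5.0: 2000, 2.0: 50000})
    # table[10.0] == 10 - log10(52500), i.e. about 5.28; weaker scores shrink further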
+ + """ + ln: cython.long + i: cython.long + j: cython.long + v: cython.float + q: cython.float + pre_q: cython.float # store the p and q scores + N: cython.long + k: cython.float + f: cython.float + pvalue2qvalue: object + pvalue_stat: dict + unique_values: list + + assert self.scoring_method == ord('p') + + pvalue_stat = self.pvalue_stat + + N = sum(pvalue_stat.values()) + k = 1 # rank + f = -log10(N) + pre_q = 2147483647 # save the previous q-value + + pvalue2qvalue = Float32to32Map(for_int=False) + unique_values = sorted(list(pvalue_stat.keys()), reverse=True) + for i in range(len(unique_values)): + v = unique_values[i] + ln = pvalue_stat[v] + q = v + (log10(k) + f) + if q > pre_q: + q = pre_q + if q <= 0: + q = 0 + break + pvalue2qvalue[v] = q + pre_q = q + k += ln + # bottom rank pscores all have qscores 0 + for j in range(i, len(unique_values)): + v = unique_values[j] + pvalue2qvalue[v] = 0 + return pvalue2qvalue + + @cython.cfunc + def compute_likelihood(self): + """Calculate log10 likelihood. + + """ + ln: cython.long + i: cython.long + chrom: bytes + v1: cython.float + v2: cython.float + pseudocount: cython.float + + pseudocount = self.pseudocount + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1].flat.__next__ # pileup in treatment + c = self.data[chrom][2].flat.__next__ # pileup in control + v = self.data[chrom][3] # score + ln = self.datalength[chrom] + v1 = 2 + v2 = 1 + for i in range(ln): + v1 = p() + v2 = c() + v[i] = logLR_asym(v1 + pseudocount, v2 + pseudocount) + self.scoring_method = ord('l') + return + + @cython.cfunc + def compute_sym_likelihood(self): + """Calculate symmetric log10 likelihood. + + """ + ln: cython.long + i: cython.long + chrom: bytes + v1: cython.float + v2: cython.float + pseudocount: cython.float + + pseudocount = self.pseudocount + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1].flat.__next__ + c = self.data[chrom][2].flat.__next__ + v = self.data[chrom][3] + ln = self.datalength[chrom] + v1 = 2 + v2 = 1 + for i in range(ln): + v1 = p() + v2 = c() + v[i] = logLR_sym(v1 + pseudocount, v2 + pseudocount) + self.scoring_method = ord('s') + return + + @cython.cfunc + def compute_logFE(self): + """Calculate log10 fold enrichment (with 1 pseudocount). + + """ + p: cnp.ndarray + c: cnp.ndarray + v: cnp.ndarray + ln: cython.long + i: cython.long + pseudocount: cython.float + + pseudocount = self.pseudocount + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1] + c = self.data[chrom][2] + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = get_logFE(p[i] + pseudocount, c[i] + pseudocount) + self.scoring_method = ord('f') + return + + @cython.cfunc + def compute_foldenrichment(self): + """Calculate linear scale fold enrichment (with 1 pseudocount). 
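To make the sign convention of these likelihood scores concrete, here is a plain-Python restatement of logLR_asym with one worked number. This is illustration only; the real implementation above also caches results in asym_logLR_dict:

    from math import log
    LOG10_E = 0.43429448190325176

    def logLR_asym(x: float, y: float) -> float:
        # log10 likelihood ratio of Poisson(x) vs Poisson(y) at count x;
        # the sign is flipped when x < y so depletion scores negative
        if x > y:
            return (x * (log(x) - log(y)) + y - x) * LOG10_E
        elif x < y:
            return (x * (-log(x) + log(y)) - y + x) * LOG10_E
        return 0.0

    # with the default pseudocount 1.0: treatment pileup 9, control pileup 1
    print(round(logLR_asym(9 + 1.0, 1 + 1.0), 3))   # 3.515, strong enrichment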
+ + """ + p: cnp.ndarray + c: cnp.ndarray + v: cnp.ndarray + ln: cython.long + i: cython.long + pseudocount: cython.float + + pseudocount = self.pseudocount + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1] + c = self.data[chrom][2] + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = (p[i] + pseudocount)/(c[i] + pseudocount) + self.scoring_method = ord('F') + return + + @cython.cfunc + def compute_subtraction(self): + p: cnp.ndarray + c: cnp.ndarray + v: cnp.ndarray + ln: cython.long + i: cython.long + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1] + c = self.data[chrom][2] + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = p[i] - c[i] + self.scoring_method = ord('d') + return + + @cython.cfunc + def compute_SPMR(self): + p: cnp.ndarray + v: cnp.ndarray + ln: cython.long + i: cython.long + scale: cython.float + + if self.normalization_method == ord('T') or self.normalization_method == ord('N'): + scale = self.treat_edm + elif self.normalization_method == ord('C'): + scale = self.ctrl_edm + elif self.normalization_method == ord('M'): + scale = 1 + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1] + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = p[i] / scale # two digit precision may not be enough... + self.scoring_method = ord('m') + return + + @cython.cfunc + def compute_max(self): + p: cnp.ndarray + c: cnp.ndarray + v: cnp.ndarray + ln: cython.long + i: cython.long + + for chrom in sorted(self.data.keys()): + p = self.data[chrom][1] + c = self.data[chrom][2] + v = self.data[chrom][3] + ln = self.datalength[chrom] + for i in range(ln): + v[i] = max(p[i], c[i]) + self.scoring_method = ord('M') + return + + @cython.ccall + def write_bedGraph(self, + fhd, + name: str, + description: str, + column: cython.short = 3): + """Write all data to fhd in bedGraph Format. + + fhd: a filehandler to save bedGraph. + + name/description: the name and description in track line. + + colname: can be 1: chip, 2: control, 3: score + + """ + chrom: bytes + ln: cython.int + pre: cython.int + i: cython.int + p: cython.int + pre_v: cython.float + v: cython.float + chrs: set + pos: cnp.ndarray + value: cnp.ndarray + + assert column in range(1, 4), "column should be between 1, 2 or 3." + + write = fhd.write + + if self.trackline: + # this line is REQUIRED by the wiggle format for UCSC browser + write("track type=bedGraph name=\"%s\" description=\"%s\"\n" % + (name.decode(), description)) + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + pos = self.data[chrom][0] + value = self.data[chrom][column] + ln = self.datalength[chrom] + pre = 0 + if pos.shape[0] == 0: + continue # skip if there's no data + pre_v = value[0] + for i in range(1, ln): + v = value[i] + p = pos[i-1] + if abs(pre_v - v) > 1e-5: # precision is 5 digits + write("%s\t%d\t%d\t%.5f\n" % + (chrom.decode(), pre, p, pre_v)) + pre_v = v + pre = p + p = pos[-1] + # last one + write("%s\t%d\t%d\t%.5f\n" % + (chrom.decode(), pre, p, pre_v)) + + return True + + @cython.ccall + def call_peaks(self, + cutoff: cython.float = 5.0, + min_length: cython.int = 200, + max_gap: cython.int = 50, + call_summits: bool = False): + """This function try to find regions within which, scores + are continuously higher than a given cutoff. + + This function is NOT using sliding-windows. 
Instead, any + regions in bedGraph above certain cutoff will be detected, + then merged if the gap between nearby two regions are below + max_gap. After this, peak is reported if its length is above + min_length. + + cutoff: cutoff of value, default 5. For -log10pvalue, it means 10^-5. + min_length : minimum peak length, default 200. + max_gap : maximum gap to merge nearby peaks, default 50. + call_summits: whether or not to call all summits (local maxima). + """ + i: cython.int + chrom: bytes + pos: cnp.ndarray + sample: cnp.ndarray + control: cnp.ndarray + value: cnp.ndarray + above_cutoff: cnp.ndarray + above_cutoff_v: cnp.ndarray + above_cutoff_endpos: cnp.ndarray + above_cutoff_startpos: cnp.ndarray + above_cutoff_sv: cnp.ndarray + peak_content: list + + chrs = self.get_chr_names() + peaks = PeakIO() # dictionary to save peaks + + self.cutoff = cutoff + for chrom in sorted(chrs): + peak_content = [] # to store points above cutoff + + pos = self.data[chrom][0] + sample = self.data[chrom][1] + # control = self.data[chrom][2] + value = self.data[chrom][3] + + # indices where score is above cutoff + above_cutoff = np.nonzero(value >= cutoff)[0] + # scores where score is above cutoff + above_cutoff_v = value[above_cutoff] + # end positions of regions where score is above cutoff + above_cutoff_endpos = pos[above_cutoff] + # start positions of regions where score is above cutoff + above_cutoff_startpos = pos[above_cutoff-1] + # sample pileup height where score is above cutoff + above_cutoff_sv = sample[above_cutoff] + if above_cutoff_v.size == 0: + # nothing above cutoff + continue + + if above_cutoff[0] == 0: + # first element > cutoff, fix the first point as + # 0. otherwise it would be the last item in + # data[chrom]['pos'] + above_cutoff_startpos[0] = 0 + + # first bit of region above cutoff + peak_content.append((above_cutoff_startpos[0], + above_cutoff_endpos[0], + above_cutoff_v[0], + above_cutoff_sv[0], + above_cutoff[0])) + for i in range(1, above_cutoff_startpos.size): + if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap: + # append + peak_content.append((above_cutoff_startpos[i], + above_cutoff_endpos[i], + above_cutoff_v[i], + above_cutoff_sv[i], + above_cutoff[i])) + else: + # close + if call_summits: + self.__close_peak2(peak_content, + peaks, + min_length, + chrom, + max_gap//2) + else: + self.__close_peak(peak_content, + peaks, + min_length, + chrom) + peak_content = [(above_cutoff_startpos[i], + above_cutoff_endpos[i], + above_cutoff_v[i], + above_cutoff_sv[i], + above_cutoff[i]),] + + # save the last peak + if not peak_content: + continue + else: + if call_summits: + self.__close_peak2(peak_content, + peaks, + min_length, + chrom, + max_gap//2) + else: + self.__close_peak(peak_content, + peaks, + min_length, + chrom) + + return peaks + + @cython.cfunc + def __close_peak(self, + peak_content: list, + peaks: object, + min_length: cython.int, + chrom: bytes) -> bool: + """Close the peak region, output peak boundaries, peak summit + and scores, then add the peak to peakIO object. + + In this function, we define the peak summit as the middle + point of the region with the highest score, in this peak. For + example, if the region of the highest score is from 100 to + 200, the summit is 150. If there are several regions of the + same 'highest score', we will first calculate the possible + summit for each such region, then pick a position close to the + middle index (= (len(highest_regions) + 1) / 2) of these + summits. 
For example, if there are three regions with the same
+        highest scores, [100,200], [300,400], [600,700], we will first
+        find the possible summits as 150, 350, and 650, and then pick
+        the middle index, the 2nd, of the three positions -- 350 as
+        the final summit. If there are four regions, we pick the 2nd
+        as well.
+
+        peaks: a PeakIO object
+
+        """
+        summit_pos: cython.int
+        tstart: cython.int
+        tend: cython.int
+        summit_index: cython.int
+        i: cython.int
+        midindex: cython.int
+        summit_value: cython.float
+        tvalue: cython.float
+        tsummitvalue: cython.float
+
+        peak_length = peak_content[-1][1] - peak_content[0][0]
+        if peak_length >= min_length:  # if the peak is too small, reject it
+            tsummit = []
+            summit_pos = 0
+            summit_value = 0
+            for i in range(len(peak_content)):
+                (tstart, tend, tvalue, tsummitvalue, tindex) = peak_content[i]
+                #for (tstart,tend,tvalue,tsummitvalue, tindex) in peak_content:
+                if not summit_value or summit_value < tsummitvalue:
+                    tsummit = [(tend + tstart) / 2,]
+                    tsummit_index = [tindex,]
+                    summit_value = tsummitvalue
+                elif summit_value == tsummitvalue:
+                    # remember continuous summit values
+                    tsummit.append(int((tend + tstart) / 2))
+                    tsummit_index.append(tindex)
+            # the middle of all highest points in peak region is defined as summit
+            midindex = int((len(tsummit) + 1) / 2) - 1
+            summit_pos = tsummit[midindex]
+            summit_index = tsummit_index[midindex]
+            if self.scoring_method == ord('q'):
+                qscore = self.data[chrom][3][summit_index]
+            else:
+                # if q value is not computed, use -1
+                qscore = -1
+
+            peaks.add(chrom,
+                      peak_content[0][0],
+                      peak_content[-1][1],
+                      summit=summit_pos,
+                      peak_score=self.data[chrom][3][summit_index],
+                      # should be the same as summit_value
+                      pileup=self.data[chrom][1][summit_index],
+                      pscore=get_pscore(self.data[chrom][1][summit_index],
+                                        self.data[chrom][2][summit_index]),
+                      fold_change=(self.data[chrom][1][summit_index] +
+                                   self.pseudocount) / (self.data[chrom][2][summit_index] +
+                                                        self.pseudocount),
+                      qscore=qscore,
+                      )
+        # start a new peak
+        return True
+
+    @cython.cfunc
+    def __close_peak2(self,
+                      peak_content: list,
+                      peaks: object,
+                      min_length: cython.int,
+                      chrom: bytes,
+                      smoothlen: cython.int = 51,
+                      min_valley: cython.float = 0.9) -> bool:
+        """Close the peak region, output peak boundaries, peak summit
+        and scores, then add the peak to peakIO object.
+
+        In this function, we use signal processing methods to smooth
+        the scores in the peak region, find the maxima and enforce the
+        peaky shape, and to define the best maxima as the peak
+        summit. The functions used for signal processing are 'maxima'
+        (with 2nd order polynomial filter) and 'enforce_peakyness'
+        in SignalProcessing.pyx.
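A minimal sketch of the mid-index tie-breaking rule that __close_peak applies when several regions share the highest score; pick_summit is a hypothetical helper, not part of the class:

    def pick_summit(highest_regions):
        """highest_regions: [(start, end), ...] all sharing the top score."""
        candidates = [(s + e) // 2 for (s, e) in highest_regions]
        midindex = int((len(candidates) + 1) / 2) - 1   # same rule as above
        return candidates[midindex]

    # three tied regions -> middle one wins, as in the docstring example
    assert pick_summit([(100, 200), (300, 400), (600, 700)]) == 350
    # four tied regions -> still the 2nd
    assert pick_summit([(0, 10), (20, 30), (40, 50), (60, 70)]) == 25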
+ + peaks: a PeakIO object + + """ + tstart: cython.int + tend: cython.int + tmpindex: cython.int + summit_index: cython.int + summit_offset: cython.int + start: cython.int + end: cython.int + i: cython.int + j: cython.int + start_boundary: cython.int + tvalue: cython.float + peakdata: cnp.ndarray(cython.float, ndim=1) + peakindices: cnp.ndarray(cython.int, ndim=1) + summit_offsets: cnp.ndarray(cython.int, ndim=1) + + # Add 10 bp padding to peak region so that we can get true minima + end = peak_content[-1][1] + 10 + start = peak_content[0][0] - 10 + if start < 0: + start_boundary = 10 + start + start = 0 + else: + start_boundary = 10 + peak_length = end - start + if end - start < min_length: + return # if the region is too small, reject it + + peakdata = np.zeros(end - start, dtype='f4') + peakindices = np.zeros(end - start, dtype='i4') + for (tstart, tend, tvalue, tsvalue, tmpindex) in peak_content: + i = tstart - start + start_boundary + j = tend - start + start_boundary + peakdata[i:j] = tsvalue + peakindices[i:j] = tmpindex + summit_offsets = maxima(peakdata, smoothlen) + if summit_offsets.shape[0] == 0: + # **failsafe** if no summits, fall back on old approach # + return self.__close_peak(peak_content, peaks, min_length, chrom) + else: + # remove maxima that occurred in padding + i = np.searchsorted(summit_offsets, + start_boundary) + j = np.searchsorted(summit_offsets, + peak_length + start_boundary, + 'right') + summit_offsets = summit_offsets[i:j] + + summit_offsets = enforce_peakyness(peakdata, summit_offsets) + if summit_offsets.shape[0] == 0: + # **failsafe** if no summits, fall back on old approach # + return self.__close_peak(peak_content, peaks, min_length, chrom) + + summit_indices = peakindices[summit_offsets] + summit_offsets -= start_boundary + + peak_scores = self.data[chrom][3][summit_indices] + if not (peak_scores > self.cutoff).all(): + return self.__close_peak(peak_content, peaks, min_length, chrom) + for summit_offset, summit_index in zip(summit_offsets, summit_indices): + if self.scoring_method == ord('q'): + qscore = self.data[chrom][3][summit_index] + else: + # if q value is not computed, use -1 + qscore = -1 + peaks.add(chrom, + start, + end, + summit=start + summit_offset, + peak_score=self.data[chrom][3][summit_index], + # should be the same as summit_value + pileup=self.data[chrom][1][summit_index], + pscore=get_pscore(self.data[chrom][1][summit_index], + self.data[chrom][2][summit_index]), + fold_change=(self.data[chrom][1][summit_index] + + self.pseudocount) / (self.data[chrom][2][summit_index] + + self.pseudocount), + qscore=qscore, + ) + # start a new peak + return True + + @cython.cfunc + def total(self) -> cython.long: + """Return the number of regions in this object. + + """ + t: cython.long + chrom: bytes + + t = 0 + for chrom in sorted(self.data.keys()): + t += self.datalength[chrom] + return t + + @cython.ccall + def call_broadpeaks(self, + lvl1_cutoff: cython.float = 5.0, + lvl2_cutoff: cython.float = 1.0, + min_length: cython.int = 200, + lvl1_max_gap: cython.int = 50, + lvl2_max_gap: cython.int = 400): + """This function try to find enriched regions within which, + scores are continuously higher than a given cutoff for level + 1, and link them using the gap above level 2 cutoff with a + maximum length of lvl2_max_gap. + + lvl1_cutoff: cutoff of value at enriched regions, default 5.0. + lvl2_cutoff: cutoff of value at linkage regions, default 1.0. + min_length : minimum peak length, default 200. 
+ lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. + lvl2_max_gap : maximum length of linkage regions, default 400. + + Return both general PeakIO object for highly enriched regions + and gapped broad regions in BroadPeakIO. + """ + i: cython.int + chrom: bytes + + assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2." + assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1." + lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, + min_length=min_length, + max_gap=lvl1_max_gap) + lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, + min_length=min_length, + max_gap=lvl2_max_gap) + chrs = lvl1_peaks.peaks.keys() + broadpeaks = BroadPeakIO() + # use lvl2_peaks as linking regions between lvl1_peaks + for chrom in sorted(chrs): + lvl1peakschrom = lvl1_peaks.peaks[chrom] + lvl2peakschrom = lvl2_peaks.peaks[chrom] + lvl1peakschrom_next = iter(lvl1peakschrom).__next__ + tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region + # our assumption is lvl1 regions should be included in lvl2 regions + try: + lvl1 = lvl1peakschrom_next() + for i in range(len(lvl2peakschrom)): + # for each lvl2 peak, find all lvl1 peaks inside + # I assume lvl1 peaks can be ALL covered by lvl2 peaks. + lvl2 = lvl2peakschrom[i] + + while True: + if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: + tmppeakset.append(lvl1) + lvl1 = lvl1peakschrom_next() + else: + # make a hierarchical broad peak + #print lvl2["start"], lvl2["end"], lvl2["score"] + self.__add_broadpeak(broadpeaks, + chrom, + lvl2, + tmppeakset) + tmppeakset = [] + break + except StopIteration: + # no more strong (aka lvl1) peaks left + self.__add_broadpeak(broadpeaks, + chrom, + lvl2, + tmppeakset) + tmppeakset = [] + # add the rest lvl2 peaks + for j in range(i+1, len(lvl2peakschrom)): + self.__add_broadpeak(broadpeaks, + chrom, + lvl2peakschrom[j], + tmppeakset) + + return broadpeaks + + def __add_broadpeak(self, + bpeaks, + chrom: bytes, + lvl2peak: dict, + lvl1peakset: list): + """Internal function to create broad peak. + """ + + blockNum: cython.int + thickStart: cython.int + thickEnd: cython.int + start: cython.int + end: cython.int + blockSizes: bytes + blockStarts: bytes + + start = lvl2peak["start"] + end = lvl2peak["end"] + + # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside + if not lvl1peakset: + # will complement by adding 1bps start and end to this region + # may change in the future if gappedPeak format was improved. 
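The block bookkeeping in __add_broadpeak follows the BED12/gappedPeak convention: blockStarts are relative to the broad region start, and 1 bp marker blocks are padded in when no strong peak touches an edge. A hedged standalone sketch of that construction (gapped_blocks is a hypothetical helper, returning plain lists instead of the byte strings built below):

    def gapped_blocks(start, end, lvl1_peaks):
        """lvl1_peaks: [(s, e), ...] strong peaks inside the broad region."""
        sizes = [e - s for (s, e) in lvl1_peaks]
        starts = [s - start for (s, e) in lvl1_peaks]
        # pad with 1 bp marker blocks when no strong peak touches an edge
        if not lvl1_peaks or lvl1_peaks[0][0] != start:
            sizes.insert(0, 1)
            starts.insert(0, 0)
        if not lvl1_peaks or lvl1_peaks[-1][1] != end:
            sizes.append(1)
            starts.append(end - start - 1)
        return len(sizes), sizes, starts

    print(gapped_blocks(1000, 2000, [(1100, 1300), (1500, 1900)]))
    # (4, [1, 200, 400, 1], [0, 100, 500, 999])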
+ bpeaks.add(chrom, + start, + end, + score=lvl2peak["score"], + thickStart=(b"%d" % start), + thickEnd=(b"%d" % end), + blockNum=2, + blockSizes=b"1,1", + blockStarts=(b"0,%d" % (end-start-1)), + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + + thickStart = b"%d" % lvl1peakset[0]["start"] + thickEnd = b"%d" % lvl1peakset[-1]["end"] + blockNum = int(len(lvl1peakset)) + blockSizes = b",".join([b"%d" % x["length"] for x in lvl1peakset]) + blockStarts = b",".join([b"%d" % (x["start"]-start) for x in lvl1peakset]) + + if lvl2peak["start"] != thickStart: + # add 1bp mark for the start of lvl2 peak + thickStart = b"%d" % start + blockNum += 1 + blockSizes = b"1,"+blockSizes + blockStarts = b"0,"+blockStarts + if lvl2peak["end"] != thickEnd: + # add 1bp mark for the end of lvl2 peak + thickEnd = b"%d" % end + blockNum += 1 + blockSizes = blockSizes+b",1" + blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) + + # add to BroadPeakIO object + bpeaks.add(chrom, + start, + end, + score=lvl2peak["score"], + thickStart=thickStart, + thickEnd=thickEnd, + blockNum=blockNum, + blockSizes=blockSizes, + blockStarts=blockStarts, + pileup=lvl2peak["pileup"], + pscore=lvl2peak["pscore"], + fold_change=lvl2peak["fc"], + qscore=lvl2peak["qscore"]) + return bpeaks + +@cython.cclass +class TwoConditionScores: + """Class for saving two condition comparison scores. + """ + # dictionary for data of each chromosome + data: dict + # length of data array of each chromosome + datalength: dict + # factor to apply to cond1 pileup values + cond1_factor: cython.float + # factor to apply to cond2 pileup values + cond2_factor: cython.float + # the pseudocount used to calcuate LLR + pseudocount: cython.float + cutoff: cython.float + t1bdg: object + c1bdg: object + t2bdg: object + c2bdg: object + pvalue_stat1: dict + pvalue_stat2: dict + pvalue_stat3: dict + + def __init__(self, + t1bdg, + c1bdg, + t2bdg, + c2bdg, + cond1_factor: cython.float = 1.0, + cond2_factor: cython.float = 1.0, + pseudocount: cython.float = 0.01, + proportion_background_empirical_distribution: cython.float = 0.99999): + """t1bdg: a bedGraphTrackI object for treat 1 + c1bdg: a bedGraphTrackI object for control 1 + t2bdg: a bedGraphTrackI object for treat 2 + c2bdg: a bedGraphTrackI object for control 2 + + cond1_factor: this will be multiplied to values in t1bdg and c1bdg + cond2_factor: this will be multiplied to values in t2bdg and c2bdg + + pseudocount: pseudocount, by default 0.01. + + proportion_background_empirical_distribution: proportion of + genome as the background to build empirical distribution + + """ + # for each chromosome, there is a l*4 matrix. First column: end + # position of a region; Second: treatment pileup; third: + # control pileup ; forth: score (can be p/q-value/likelihood + # ratio/fold-enrichment/subtraction depending on -c setting) + self.data = {} + self.datalength = {} + self.cond1_factor = cond1_factor + self.cond2_factor = cond2_factor + self.pseudocount = pseudocount + self.pvalue_stat1 = {} + self.pvalue_stat2 = {} + self.t1bdg = t1bdg + self.c1bdg = c1bdg + self.t2bdg = t2bdg + self.c2bdg = c2bdg + # self.empirical_distr_llr = [] # save all values in histogram + + @cython.ccall + def set_pseudocount(self, pseudocount: cython.float): + self.pseudocount = pseudocount + + @cython.ccall + def build(self): + """Compute scores from 3 types of comparisons and store them + in self.data. 
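build_chromosome below steps through four bedGraph position lists in lockstep, always advancing whichever iterators sit at the current minimum boundary. A simplified generator sketch of that synchronized walk (merge_positions is hypothetical; it only yields the merged boundaries, without the per-track value bookkeeping the real method does):

    def merge_positions(*position_lists):
        """Yield the sorted union of several sorted position lists,
        i.e. the interval boundaries the merge steps through."""
        iters = [iter(pl) for pl in position_lists]
        heads = [next(it, None) for it in iters]
        while any(h is not None for h in heads):
            minp = min(h for h in heads if h is not None)
            yield minp
            for k, h in enumerate(heads):
                if h == minp:                 # advance every list at the boundary
                    heads[k] = next(iters[k], None)

    print(list(merge_positions([10, 50], [20, 50], [30], [50])))
    # [10, 20, 30, 50]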
+ + """ + common_chrs: set + chrname: bytes + chrom_max_len: cython.int + # common chromosome names + common_chrs = self.get_common_chrs() + for chrname in common_chrs: + (cond1_treat_ps, cond1_treat_vs) = self.t1bdg.get_data_by_chr(chrname) + (cond1_control_ps, cond1_control_vs) = self.c1bdg.get_data_by_chr(chrname) + (cond2_treat_ps, cond2_treat_vs) = self.t2bdg.get_data_by_chr(chrname) + (cond2_control_ps, cond2_control_vs) = self.c2bdg.get_data_by_chr(chrname) + chrom_max_len = len(cond1_treat_ps) + len(cond1_control_ps) + len(cond2_treat_ps) + len(cond2_control_ps) + self.add_chromosome(chrname, chrom_max_len) + self.build_chromosome(chrname, + cond1_treat_ps, cond1_control_ps, + cond2_treat_ps, cond2_control_ps, + cond1_treat_vs, cond1_control_vs, + cond2_treat_vs, cond2_control_vs) + + @cython.cfunc + def build_chromosome(self, chrname, + cond1_treat_ps, cond1_control_ps, + cond2_treat_ps, cond2_control_ps, + cond1_treat_vs, cond1_control_vs, + cond2_treat_vs, cond2_control_vs): + """Internal function to calculate scores for three types of comparisons. + + cond1_treat_ps, cond1_control_ps: position of treat and control of condition 1 + cond2_treat_ps, cond2_control_ps: position of treat and control of condition 2 + cond1_treat_vs, cond1_control_vs: value of treat and control of condition 1 + cond2_treat_vs, cond2_control_vs: value of treat and control of condition 2 + + """ + c1tp: cython.int + c1cp: cython.int + c2tp: cython.int + c2cp: cython.int + minp: cython.int + pre_p: cython.int + c1tv: cython.float + c1cv: cython.float + c2tv: cython.float + c2cv: cython.float + + c1tpn = iter(cond1_treat_ps).__next__ + c1cpn = iter(cond1_control_ps).__next__ + c2tpn = iter(cond2_treat_ps).__next__ + c2cpn = iter(cond2_control_ps).__next__ + c1tvn = iter(cond1_treat_vs).__next__ + c1cvn = iter(cond1_control_vs).__next__ + c2tvn = iter(cond2_treat_vs).__next__ + c2cvn = iter(cond2_control_vs).__next__ + + pre_p = 0 + + try: + c1tp = c1tpn() + c1tv = c1tvn() + + c1cp = c1cpn() + c1cv = c1cvn() + + c2tp = c2tpn() + c2tv = c2tvn() + + c2cp = c2cpn() + c2cv = c2cvn() + + while True: + minp = min(c1tp, c1cp, c2tp, c2cp) + self.add(chrname, pre_p, c1tv, c1cv, c2tv, c2cv) + pre_p = minp + if c1tp == minp: + c1tp = c1tpn() + c1tv = c1tvn() + if c1cp == minp: + c1cp = c1cpn() + c1cv = c1cvn() + if c2tp == minp: + c2tp = c2tpn() + c2tv = c2tvn() + if c2cp == minp: + c2cp = c2cpn() + c2cv = c2cvn() + except StopIteration: + # meet the end of either bedGraphTrackI, simply exit + pass + return + + @cython.cfunc + def get_common_chrs(self) -> set: + t1chrs: set + c1chrs: set + t2chrs: set + c2chrs: set + common: set + t1chrs = self.t1bdg.get_chr_names() + c1chrs = self.c1bdg.get_chr_names() + t2chrs = self.t2bdg.get_chr_names() + c2chrs = self.c2bdg.get_chr_names() + common = reduce(lambda x, y: x.intersection(y), + (t1chrs, c1chrs, t2chrs, c2chrs)) + return common + + @cython.cfunc + def add_chromosome(self, + chrom: bytes, + chrom_max_len: cython.int): + """ + chrom: chromosome name + chrom_max_len: maximum number of data points in this chromosome + + """ + if chrom not in self.data: + self.data[chrom] = [np.zeros(chrom_max_len, dtype="i4"), # pos + np.zeros(chrom_max_len, dtype="f4"), # LLR t1 vs c1 + np.zeros(chrom_max_len, dtype="f4"), # LLR t2 vs c2 + np.zeros(chrom_max_len, dtype="f4")] # LLR t1 vs t2 + self.datalength[chrom] = 0 + + @cython.cfunc + def add(self, + chromosome: bytes, + endpos: cython.int, + t1: cython.float, + c1: cython.float, + t2: cython.float, + c2: cython.float): + """Take 
chr-endpos-sample1-control1-sample2-control2 and
+        compute logLR for t1 vs c1, t2 vs c2, and t1 vs t2, then save
+        values.
+
+        chromosome: chromosome name in string
+        endpos    : end position of each interval in integer
+        t1        : Sample 1 ChIP pileup value of each interval in float
+        c1        : Sample 1 Control pileup value of each interval in float
+        t2        : Sample 2 ChIP pileup value of each interval in float
+        c2        : Sample 2 Control pileup value of each interval in float
+
+        *Warning* Need to add regions continuously.
+        """
+        i: cython.int
+        c: list
+
+        i = self.datalength[chromosome]
+        c = self.data[chromosome]
+        c[0][i] = endpos
+        c[1][i] = logLR_asym((t1+self.pseudocount) * self.cond1_factor,
+                             (c1+self.pseudocount) * self.cond1_factor)
+        c[2][i] = logLR_asym((t2+self.pseudocount) * self.cond2_factor,
+                             (c2+self.pseudocount) * self.cond2_factor)
+        c[3][i] = logLR_sym((t1+self.pseudocount) * self.cond1_factor,
+                            (t2+self.pseudocount) * self.cond2_factor)
+        self.datalength[chromosome] += 1
+        return
+
+    @cython.ccall
+    def finalize(self):
+        """
+        Adjust array size of each chromosome.
+
+        """
+        chrom: bytes
+        ln: cython.int
+        d: list
+
+        for chrom in sorted(self.data.keys()):
+            d = self.data[chrom]
+            ln = self.datalength[chrom]
+            d[0].resize(ln, refcheck=False)
+            d[1].resize(ln, refcheck=False)
+            d[2].resize(ln, refcheck=False)
+            d[3].resize(ln, refcheck=False)
+        return
+
+    @cython.ccall
+    def get_data_by_chr(self,
+                        chromosome: bytes):
+        """Return array of counts by chromosome.
+
+        The return value is a tuple:
+        ([end pos],[value])
+        """
+        if chromosome in self.data:
+            return self.data[chromosome]
+        else:
+            return None
+
+    @cython.ccall
+    def get_chr_names(self):
+        """Return all the chromosome names stored.
+
+        """
+        return set(self.data.keys())
+
+    @cython.ccall
+    def write_bedGraph(self,
+                       fhd,
+                       name: str,
+                       description: str,
+                       column: cython.int = 3):
+        """Write all data to fhd in bedGraph format.
+
+        fhd: a filehandler to save bedGraph.
+
+        name/description: the name and description in track line.
+
+        column: can be 1: cond1 chip vs cond1 ctrl, 2: cond2 chip vs
+        cond2 ctrl, 3: cond1 chip vs cond2 chip
+
+        """
+        chrom: bytes
+        ln: cython.int
+        pre: cython.int
+        i: cython.int
+        p: cython.int
+        pre_v: cython.float
+        v: cython.float
+        pos: cnp.ndarray
+        value: cnp.ndarray
+
+        assert column in range(1, 4), "column should be 1, 2, or 3."
+
+        write = fhd.write
+
+        # if self.trackline:
+        #     # this line is REQUIRED by the wiggle format for UCSC browser
+        #     write("track type=bedGraph name=\"%s\" description=\"%s\"\n" % (name.decode(), description))
+
+        chrs = self.get_chr_names()
+        for chrom in sorted(chrs):
+            pos = self.data[chrom][0]
+            value = self.data[chrom][column]
+            ln = self.datalength[chrom]
+            pre = 0
+            if pos.shape[0] == 0:
+                continue            # skip if there's no data
+            pre_v = value[0]
+            for i in range(1, ln):
+                v = value[i]
+                p = pos[i-1]
+                if abs(pre_v - v) >= 1e-6:
+                    write("%s\t%d\t%d\t%.5f\n" %
+                          (chrom.decode(), pre, p, pre_v))
+                    pre_v = v
+                    pre = p
+            p = pos[-1]
+            # last one
+            write("%s\t%d\t%d\t%.5f\n" % (chrom.decode(), pre, p, pre_v))
+
+        return True
+
+    @cython.ccall
+    def write_matrix(self,
+                     fhd,
+                     name: str,
+                     description: str):
+        """Write all data to fhd in a four-column format:
+
+        col1: chr_start_end
+        col2: t1 vs c1
+        col3: t2 vs c2
+        col4: t1 vs t2
+
+        fhd: a filehandler to save the matrix.
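+
+        Each row is tab-separated, e.g. (illustrative values only):
+
+        chr1:0_1000    2.30000    0.10000    1.50000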
+ + """ + chrom: bytes + ln: cython.int + pre: cython.int + i: cython.int + p: cython.int + v1: cython.float + v2: cython.float + v3: cython.float + pos: cnp.ndarray + value1: cnp.ndarray + value2: cnp.ndarray + value3: cnp.ndarray + + write = fhd.write + + chrs = self.get_chr_names() + for chrom in sorted(chrs): + [pos, value1, value2, value3] = self.data[chrom] + ln = self.datalength[chrom] + pre = 0 + if pos.shape[0] == 0: + continue # skip if there's no data + for i in range(0, ln): + v1 = value1[i] + v2 = value2[i] + v3 = value3[i] + p = pos[i] + write("%s:%d_%d\t%.5f\t%.5f\t%.5f\n" % + (chrom.decode(), pre, p, v1, v2, v3)) + pre = p + + return True + + @cython.ccall + def call_peaks(self, + cutoff: cython.float = 3, + min_length: cython.int = 200, + max_gap: cython.int = 100, + call_summits: bool = False) -> tuple: + """This function try to find regions within which, scores + are continuously higher than a given cutoff. + + For bdgdiff. + + This function is NOT using sliding-windows. Instead, any + regions in bedGraph above certain cutoff will be detected, + then merged if the gap between nearby two regions are below + max_gap. After this, peak is reported if its length is above + min_length. + + cutoff: cutoff of value, default 3. For log10 LR, it means 1000 or -1000. + min_length : minimum peak length, default 200. + max_gap : maximum gap to merge nearby peaks, default 100. + ptrack: an optional track for pileup heights. If it's not None, use it to find summits. Otherwise, use self/scoreTrack. + """ + chrom: bytes + pos: cnp.ndarray + t1_vs_c1: cnp.ndarray + t2_vs_c2: cnp.ndarray + t1_vs_t2: cnp.ndarray + cond1_over_cond2: cnp.ndarray + cond2_over_cond1: cnp.ndarray + cond1_equal_cond2: cnp.ndarray + cond1_sig: cnp.ndarray + cond2_sig: cnp.ndarray + cat1: cnp.ndarray + cat2: cnp.ndarray + cat3: cnp.ndarray + cat1_startpos: cnp.ndarray + cat1_endpos: cnp.ndarray + cat2_startpos: cnp.ndarray + cat2_endpos: cnp.ndarray + cat3_startpos: cnp.ndarray + cat3_endpos: cnp.ndarray + + chrs = self.get_chr_names() + cat1_peaks = PeakIO() # dictionary to save peaks significant at condition 1 + cat2_peaks = PeakIO() # dictionary to save peaks significant at condition 2 + cat3_peaks = PeakIO() # dictionary to save peaks significant in both conditions + + self.cutoff = cutoff + + for chrom in sorted(chrs): + pos = self.data[chrom][0] + t1_vs_c1 = self.data[chrom][1] + t2_vs_c2 = self.data[chrom][2] + t1_vs_t2 = self.data[chrom][3] + and_ = np.logical_and + # regions with stronger cond1 signals + cond1_over_cond2 = t1_vs_t2 >= cutoff + # regions with stronger cond2 signals + cond2_over_cond1 = t1_vs_t2 <= -1*cutoff + cond1_equal_cond2 = and_(t1_vs_t2 >= -1*cutoff, t1_vs_t2 <= cutoff) + # enriched regions in condition 1 + cond1_sig = t1_vs_c1 >= cutoff + # enriched regions in condition 2 + cond2_sig = t2_vs_c2 >= cutoff + # indices where score is above cutoff + # cond1 stronger than cond2, the indices + cat1 = np.where(and_(cond1_sig, cond1_over_cond2))[0] + # cond2 stronger than cond1, the indices + cat2 = np.where(and_(cond2_over_cond1, cond2_sig))[0] + # cond1 and cond2 are equal, the indices + cat3 = np.where(and_(and_(cond1_sig, cond2_sig), + cond1_equal_cond2))[0] + + # end positions of regions where score is above cutoff + cat1_endpos = pos[cat1] + # start positions of regions where score is above cutoff + cat1_startpos = pos[cat1-1] + # end positions of regions where score is above cutoff + cat2_endpos = pos[cat2] + # start positions of regions where score is above cutoff + cat2_startpos = 
pos[cat2-1]
+            # end positions of regions where score is above cutoff
+            cat3_endpos = pos[cat3]
+            # start positions of regions where score is above cutoff
+            cat3_startpos = pos[cat3-1]
+
+            # for cat1: condition 1 stronger regions
+            self.__add_a_peak(cat1_peaks,
+                              chrom,
+                              cat1,
+                              cat1_startpos,
+                              cat1_endpos,
+                              t1_vs_t2,
+                              max_gap,
+                              min_length)
+            # for cat2: condition 2 stronger regions
+            self.__add_a_peak(cat2_peaks,
+                              chrom,
+                              cat2,
+                              cat2_startpos,
+                              cat2_endpos,
+                              -1 * t1_vs_t2,
+                              max_gap,
+                              min_length)
+            # for cat3: commonly strong regions
+            self.__add_a_peak(cat3_peaks,
+                              chrom,
+                              cat3,
+                              cat3_startpos,
+                              cat3_endpos,
+                              abs(t1_vs_t2),
+                              max_gap,
+                              min_length)
+
+        return (cat1_peaks, cat2_peaks, cat3_peaks)
+
+    @cython.cfunc
+    def __add_a_peak(self,
+                     peaks: object,
+                     chrom: bytes,
+                     indices: cnp.ndarray,
+                     startpos: cnp.ndarray,
+                     endpos: cnp.ndarray,
+                     score: cnp.ndarray,
+                     max_gap: cython.int,
+                     min_length: cython.int):
+        """For a given chromosome, merge nearby significant regions,
+        filter out smaller regions, then add regions to the PeakIO
+        object.
+
+        """
+        i: cython.int
+        peak_content: list
+        mean_logLR: cython.float
+
+        if startpos.size > 0:
+            # if it is not empty
+            peak_content = []
+            if indices[0] == 0:
+                # first element > cutoff, fix the first point as
+                # 0. otherwise it would be the last item in
+                # data[chrom]['pos']
+                startpos[0] = 0
+            # first bit of region above cutoff
+            peak_content.append((startpos[0],
+                                 endpos[0],
+                                 score[indices[0]]))
+            for i in range(1, startpos.size):
+                if startpos[i] - peak_content[-1][1] <= max_gap:
+                    # append
+                    peak_content.append((startpos[i],
+                                         endpos[i],
+                                         score[indices[i]]))
+                else:
+                    # close
+                    if peak_content[-1][1] - peak_content[0][0] >= min_length:
+                        mean_logLR = self.mean_from_peakcontent(peak_content)
+                        # if peak_content[0][0] == 22414956:
+                        #     print(f"{peak_content} {mean_logLR}")
+                        peaks.add(chrom,
+                                  peak_content[0][0],
+                                  peak_content[-1][1],
+                                  summit=-1,
+                                  peak_score=mean_logLR,
+                                  pileup=0,
+                                  pscore=0,
+                                  fold_change=0,
+                                  qscore=0,
+                                  )
+                    peak_content = [(startpos[i],
+                                     endpos[i],
+                                     score[indices[i]]),]
+
+            # save the last peak
+            if peak_content:
+                if peak_content[-1][1] - peak_content[0][0] >= min_length:
+                    mean_logLR = self.mean_from_peakcontent(peak_content)
+                    peaks.add(chrom,
+                              peak_content[0][0],
+                              peak_content[-1][1],
+                              summit=-1,
+                              peak_score=mean_logLR,
+                              pileup=0,
+                              pscore=0,
+                              fold_change=0,
+                              qscore=0,
+                              )
+
+        return
+
+    @cython.cfunc
+    def mean_from_peakcontent(self,
+                              peakcontent: list) -> cython.float:
+        """Return the length-weighted mean score over a peak_content
+        list of (start, end, value) tuples.
+
+        """
+        tmp_s: cython.int
+        tmp_e: cython.int
+        ln: cython.int
+        tmp_v: cython.double
+        sum_v: cython.double   # for better precision
+        r: cython.float
+        i: cython.int
+
+        ln = 0
+        sum_v = 0              # initialize sum_v as 0
+        for i in range(len(peakcontent)):
+            tmp_s = peakcontent[i][0]
+            tmp_e = peakcontent[i][1]
+            tmp_v = peakcontent[i][2]
+            sum_v += tmp_v * (tmp_e - tmp_s)
+            ln += tmp_e - tmp_s
+
+        r = cython.cast(cython.float, (sum_v / ln))
+        return r
+
+    @cython.cfunc
+    def total(self) -> cython.long:
+        """Return the number of regions in this object.
+
+        """
+        t: cython.long
+        chrom: bytes
+
+        t = 0
+        for chrom in sorted(self.data.keys()):
+            t += self.datalength[chrom]
+        return t
diff --git a/MACS3/Signal/ScoreTrack.pyx b/MACS3/Signal/ScoreTrack.pyx
deleted file mode 100644
index 0426b18a..00000000
--- a/MACS3/Signal/ScoreTrack.pyx
+++ /dev/null
@@ -1,1483 +0,0 @@
-# cython: language_level=3
-# cython: profile=True
-# Time-stamp: <2024-05-14 12:06:19 Tao Liu>
-
-"""Module for Feature IO classes.
- -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file LICENSE included with -the distribution). -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from copy import copy -from functools import reduce - -# ------------------------------------ -# MACS3 modules -# ------------------------------------ -from MACS3.Signal.SignalProcessing import maxima, enforce_valleys, enforce_peakyness -from MACS3.Signal.Prob import poisson_cdf -from MACS3.IO.PeakIO import PeakIO, BroadPeakIO, parse_peakname - -# ------------------------------------ -# Other modules -# ------------------------------------ -cimport cython -import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t -from cpython cimport bool -from cykhash import PyObjectMap, Float32to32Map - -# ------------------------------------ -# C lib -# ------------------------------------ -from libc.math cimport log10,log, floor, ceil - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "scoreTrack $Revision$" -__author__ = "Tao Liu " -__doc__ = "scoreTrack classes" - -# ------------------------------------ -# Misc functions -# ------------------------------------ -cdef inline int32_t int_max(int32_t a, int32_t b): return a if a >= b else b -cdef inline int32_t int_min(int32_t a, int32_t b): return a if a <= b else b - -LOG10_E = 0.43429448190325176 - -pscore_dict = PyObjectMap() - -cdef float32_t get_pscore ( int32_t observed, float32_t expectation ): - """Get p-value score from Poisson test. First check existing - table, if failed, call poisson_cdf function, then store the result - in table. - - """ - cdef: - float64_t score - - try: - return pscore_dict[(observed, expectation)] - except KeyError: - score = -1*poisson_cdf(observed,expectation,False,True) - pscore_dict[(observed, expectation)] = score - return score - -asym_logLR_dict = PyObjectMap() - -cdef float32_t logLR_asym ( float32_t x, float32_t y ): - """Calculate log10 Likelihood between H1 ( enriched ) and H0 ( - chromatin bias ). Set minus sign for depletion. - - *asymmetric version* - - """ - cdef: - float32_t s - - if (x,y) in asym_logLR_dict: - return asym_logLR_dict[ ( x, y ) ] - else: - if x > y: - s = (x*(log(x)-log(y))+y-x)*LOG10_E - elif x < y: - s = (x*(-log(x)+log(y))-y+x)*LOG10_E - else: - s = 0 - asym_logLR_dict[ ( x, y ) ] = s - return s - -sym_logLR_dict = PyObjectMap() - -cdef float32_t logLR_sym ( float32_t x, float32_t y ): - """Calculate log10 Likelihood between H1 ( enriched ) and H0 ( - another enriched ). Set minus sign for H0>H1. - - * symmetric version * - - """ - cdef: - float32_t s - - if (x,y) in sym_logLR_dict: - return sym_logLR_dict[ ( x, y ) ] - else: - if x > y: - s = (x*(log(x)-log(y))+y-x)*LOG10_E - elif y > x: - s = (y*(log(x)-log(y))+y-x)*LOG10_E - else: - s = 0 - sym_logLR_dict[ ( x, y ) ] = s - return s - -cdef float32_t get_logFE ( float32_t x, float32_t y ): - """ return 100* log10 fold enrichment with +1 pseudocount. - """ - return log10( x/y ) - -cdef float32_t get_subtraction ( float32_t x, float32_t y): - """ return subtraction. - """ - return x - y - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class ScoreTrackII: - """Class for a container to keep signals of each genomic position, - including 1. score, 2. treatment and 2. 
control pileup. - - It also contains scoring methods and call_peak functions. - """ - cdef: - dict data # dictionary for data of each chromosome - dict datalength # length of data array of each chromosome - bool trackline # whether trackline should be saved in bedGraph - float32_t treat_edm # seq depth in million of treatment - float32_t ctrl_edm # seq depth in million of control - char scoring_method # method for calculating scores. - char normalization_method # scale to control? scale to treatment? both scale to 1million reads? - float32_t pseudocount # the pseudocount used to calcuate logLR, FE or logFE - float32_t cutoff - dict pvalue_stat # save pvalue<->length dictionary - - - def __init__ (self, float32_t treat_depth, float32_t ctrl_depth, float32_t pseudocount = 1.0 ): - """Initialize. - - treat_depth and ctrl_depth are effective depth in million: - sequencing depth in million after - duplicates being filtered. If - treatment is scaled down to - control sample size, then this - should be control sample size in - million. And vice versa. - - pseudocount: a pseudocount used to calculate logLR, FE or - logFE. Please note this value will not be changed - with normalization method. So if you really want - to set pseudocount 1 per million reads, set it - after you normalize treat and control by million - reads by `change_normalizetion_method(ord('M'))`. - - """ - self.data = {} # for each chromosome, there is a l*4 - # matrix. First column: end position - # of a region; Second: treatment - # pileup; third: control pileup ; - # forth: score ( can be - # p/q-value/likelihood - # ratio/fold-enrichment/subtraction - # depending on -c setting) - self.datalength = {} - self.trackline = False - self.treat_edm = treat_depth - self.ctrl_edm = ctrl_depth - #scoring_method: p: -log10 pvalue; - # q: -log10 qvalue; - # l: log10 likelihood ratio ( minus for depletion ) - # f: log10 fold enrichment - # F: linear fold enrichment - # d: subtraction - # m: fragment pileup per million reads - # N: not set - self.scoring_method = ord("N") - - #normalization_method: T: scale to depth of treatment; - # C: scale to depth of control; - # M: scale to depth of 1 million; - # N: not set/ raw pileup - self.normalization_method = ord("N") - - self.pseudocount = pseudocount - self.pvalue_stat = {} - - cpdef set_pseudocount( self, float32_t pseudocount ): - self.pseudocount = pseudocount - - cpdef enable_trackline( self ): - """Turn on trackline with bedgraph output - """ - self.trackline = True - - cpdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ): - """ - chrom: chromosome name - chrom_max_len: maximum number of data points in this chromosome - - """ - if chrom not in self.data: - #self.data[chrom] = np.zeros( ( chrom_max_len, 4 ), dtype="int32" ) # remember col #2-4 is actual value * 100, I use integer here. - self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ), # pos - np.zeros( chrom_max_len, dtype="float32" ), # pileup at each interval, in float32 format - np.zeros( chrom_max_len, dtype="float32" ), # control at each interval, in float32 format - np.zeros( chrom_max_len, dtype="float32" ) ] # score at each interval, in float32 format - self.datalength[chrom] = 0 - - cpdef add (self, bytes chromosome, int32_t endpos, float32_t chip, float32_t control): - """Add a chr-endpos-sample-control block into data - dictionary. 
- - chromosome: chromosome name in string - endpos : end position of each interval in integer - chip : ChIP pileup value of each interval in float - control : Control pileup value of each interval in float - - *Warning* Need to add regions continuously. - """ - cdef int32_t i - i = self.datalength[chromosome] - c = self.data[chromosome] - c[0][ i ] = endpos - c[1][ i ] = chip - c[2][ i ] = control - self.datalength[chromosome] += 1 - - cpdef finalize ( self ): - """ - Adjust array size of each chromosome. - - """ - cdef: - bytes chrom, k - int32_t l - - for chrom in sorted(self.data.keys()): - d = self.data[chrom] - l = self.datalength[chrom] - d[0].resize( l, refcheck = False ) - d[1].resize( l, refcheck = False ) - d[2].resize( l, refcheck = False ) - d[3].resize( l, refcheck = False ) - return - - cpdef get_data_by_chr (self, bytes chromosome): - """Return array of counts by chromosome. - - The return value is a tuple: - ([end pos],[value]) - """ - if chromosome in self.data: - return self.data[chromosome] - else: - return None - - cpdef get_chr_names (self): - """Return all the chromosome names stored. - - """ - l = set(self.data.keys()) - return l - - cpdef change_normalization_method ( self, char normalization_method ): - """Change/set normalization method. However, I do not - recommend change this back and forward, since some precision - issue will happen -- I only keep two digits. - - normalization_method: T: scale to depth of treatment; - C: scale to depth of control; - M: scale to depth of 1 million; - N: not set/ raw pileup - """ - if normalization_method == ord('T'): - if self.normalization_method == ord('T'): # do nothing - pass - elif self.normalization_method == ord('C'): - self.normalize( self.treat_edm/self.ctrl_edm, self.treat_edm/self.ctrl_edm ) - elif self.normalization_method == ord('M'): - self.normalize( self.treat_edm, self.treat_edm ) - elif self.normalization_method == ord('N'): - self.normalize( 1, self.treat_edm/self.ctrl_edm ) - else: - raise NotImplemented - self.normalization_method = ord('T') - elif normalization_method == ord('C'): - if self.normalization_method == ord('T'): - self.normalize( self.ctrl_edm/self.treat_edm, self.ctrl_edm/self.treat_edm ) - elif self.normalization_method == ord('C'): # do nothing - pass - elif self.normalization_method == ord('M'): - self.normalize( self.ctrl_edm, self.ctrl_edm ) - elif self.normalization_method == ord('N'): - self.normalize( self.ctrl_edm/self.treat_edm, 1 ) - else: - raise NotImplemented - self.normalization_method = ord('C') - elif normalization_method == ord('M'): - if self.normalization_method == ord('T'): - self.normalize( 1/self.treat_edm, 1/self.treat_edm ) - elif self.normalization_method == ord('C'): - self.normalize( 1/self.ctrl_edm, 1/self.ctrl_edm ) - elif self.normalization_method == ord('M'): # do nothing - pass - elif self.normalization_method == ord('N'): - self.normalize( 1/self.treat_edm, 1/self.ctrl_edm ) - else: - raise NotImplemented - self.normalization_method = ord('M') - elif normalization_method == ord('N'): - if self.normalization_method == ord('T'): - self.normalize( self.treat_edm, self.treat_edm ) - elif self.normalization_method == ord('C'): - self.normalize( self.ctrl_edm, self.ctrl_edm ) - elif self.normalization_method == ord('M'): - self.normalize( self.treat_edm, self.ctrl_edm ) - elif self.normalization_method == ord('N'): # do nothing - pass - else: - raise NotImplemented - self.normalization_method = ord('N') - - cdef normalize ( self, float32_t treat_scale, float32_t 
control_scale ): - cdef: - np.ndarray p, c - int64_t l, i - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][1] - c = self.data[chrom][2] - l = self.datalength[chrom] - for i in range(l): - p[ i ] *= treat_scale - c[ i ] *= control_scale - return - - cpdef change_score_method (self, char scoring_method): - """ - scoring_method: p: -log10 pvalue; - q: -log10 qvalue; - l: log10 likelihood ratio ( minus for depletion ) - s: symmetric log10 likelihood ratio ( for comparing two ChIPs ) - f: log10 fold enrichment - F: linear fold enrichment - d: subtraction - M: maximum - m: fragment pileup per million reads - """ - if scoring_method == ord('p'): - self.compute_pvalue() - elif scoring_method == ord('q'): - #if not already calculated p, compute pvalue first - if self.scoring_method != ord('p'): - self.compute_pvalue() - self.compute_qvalue() - elif scoring_method == ord('l'): - self.compute_likelihood() - elif scoring_method == ord('s'): - self.compute_sym_likelihood() - elif scoring_method == ord('f'): - self.compute_logFE() - elif scoring_method == ord('F'): - self.compute_foldenrichment() - elif scoring_method == ord('d'): - self.compute_subtraction() - elif scoring_method == ord('m'): - self.compute_SPMR() - elif scoring_method == ord('M'): - self.compute_max() - else: - raise NotImplemented - - cdef compute_pvalue ( self ): - """Compute -log_{10}(pvalue) - """ - cdef: - np.ndarray[np.float32_t] p, c, v - np.ndarray[np.int32_t] pos - int64_t l, i, prev_pos - bytes chrom - - for chrom in sorted(self.data.keys()): - prev_pos = 0 - pos = self.data[chrom][0] - p = self.data[chrom][1] - c = self.data[chrom][2] - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = get_pscore( (p[ i ] + self.pseudocount) , c[ i ] + self.pseudocount ) - try: - self.pvalue_stat[v[ i ]] += pos[ i ] - prev_pos - except: - self.pvalue_stat[v[ i ]] = pos[ i ] - prev_pos - prev_pos = pos[ i ] - - self.scoring_method = ord('p') - return - - cdef compute_qvalue ( self ): - """Compute -log_{10}(qvalue) - """ - cdef: - object pqtable - int64_t i,l,j - bytes chrom - np.ndarray p, c, v - - # pvalue should be computed first! - assert self.scoring_method == ord('p') - # make pqtable - pqtable = self.make_pq_table() - - # convert p to q - for chrom in sorted(self.data.keys()): - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = pqtable[ v[ i ] ] - #v [ i ] = g( v[ i ]) - - self.scoring_method = ord('q') - return - - cpdef object make_pq_table ( self ): - """Make pvalue-qvalue table. - - Step1: get all pvalue and length of block with this pvalue - Step2: Sort them - Step3: Apply AFDR method to adjust pvalue and get qvalue for each pvalue - - Return a dictionary of {-log10pvalue:(-log10qvalue,rank,basepairs)} relationships. 
- """ - cdef: - int64_t n, pre_p, this_p, length, pre_l, l, i, j - float32_t this_v, pre_v, v, q, pre_q # store the p and q scores - int64_t N, k - float32_t f - bytes chrom - np.ndarray v_chrom, pos_chrom - object pvalue2qvalue - dict pvalue_stat - list unique_values - - assert self.scoring_method == ord('p') - - pvalue_stat = self.pvalue_stat - - N = sum(pvalue_stat.values()) - k = 1 # rank - f = -log10(N) - pre_v = -2147483647 - pre_l = 0 - pre_q = 2147483647 # save the previous q-value - - pvalue2qvalue = Float32to32Map( for_int = False ) - unique_values = sorted(list(pvalue_stat.keys()), reverse=True) - for i in range(len(unique_values)): - v = unique_values[i] - l = pvalue_stat[v] - q = v + (log10(k) + f) - if q > pre_q: - q = pre_q - if q <= 0: - q = 0 - break - pvalue2qvalue[ v ] = q - pre_q = q - k+=l - # bottom rank pscores all have qscores 0 - for j in range(i, len(unique_values) ): - v = unique_values[ j ] - pvalue2qvalue[ v ] = 0 - return pvalue2qvalue - - cdef compute_likelihood ( self ): - """Calculate log10 likelihood. - - """ - cdef: - #np.ndarray v, p, c - int64_t l, i - bytes chrom - float32_t v1, v2 - float32_t pseudocount - - pseudocount = self.pseudocount - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][ 1 ].flat.__next__ # pileup in treatment - c = self.data[chrom][ 2 ].flat.__next__ # pileup in control - v = self.data[chrom][ 3 ] # score - l = self.datalength[chrom] - v1 = 2 - v2 = 1 - for i in range(l): - v1 = p() - v2 = c() - v[ i ] = logLR_asym( v1 + pseudocount, v2 + pseudocount ) #logLR( d[ i, 1]/100.0, d[ i, 2]/100.0 ) - #print v1, v2, v[i] - self.scoring_method = ord('l') - return - - cdef compute_sym_likelihood ( self ): - """Calculate symmetric log10 likelihood. - - """ - cdef: - #np.ndarray v, p, c - int64_t l, i - bytes chrom - float32_t v1, v2 - float32_t pseudocount - - pseudocount = self.pseudocount - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][ 1 ].flat.__next__ - c = self.data[chrom][ 2 ].flat.__next__ - v = self.data[chrom][ 3 ] - l = self.datalength[chrom] - v1 = 2 - v2 = 1 - for i in range(l): - v1 = p() - v2 = c() - v[ i ] = logLR_sym( v1 + pseudocount, v2 + pseudocount ) #logLR( d[ i, 1]/100.0, d[ i, 2]/100.0 ) - self.scoring_method = ord('s') - return - - cdef compute_logFE ( self ): - """Calculate log10 fold enrichment ( with 1 pseudocount ). - - """ - cdef: - np.ndarray p, c, v - int64_t l, i - float32_t pseudocount - - pseudocount = self.pseudocount - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][1] - c = self.data[chrom][2] - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = get_logFE ( p[ i ] + pseudocount, c[ i ] + pseudocount) - self.scoring_method = ord('f') - return - - cdef compute_foldenrichment ( self ): - """Calculate linear scale fold enrichment ( with 1 pseudocount ). 
- - """ - cdef: - np.ndarray p, c, v - int64_t l, i - float32_t pseudocount - - pseudocount = self.pseudocount - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][1] - c = self.data[chrom][2] - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = ( p[ i ] + pseudocount )/( c[ i ] + pseudocount ) - self.scoring_method = ord('F') - return - - cdef compute_subtraction ( self ): - cdef: - np.ndarray p, c, v - int64_t l, i - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][1] - c = self.data[chrom][2] - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = p[ i ] - c[ i ] - self.scoring_method = ord('d') - return - - cdef compute_SPMR ( self ): - cdef: - np.ndarray p, v - int64_t l, i - float32_t scale - if self.normalization_method == ord('T') or self.normalization_method == ord('N'): - scale = self.treat_edm - elif self.normalization_method == ord('C'): - scale = self.ctrl_edm - elif self.normalization_method == ord('M'): - scale = 1 - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][1] - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = p[ i ] / scale # two digit precision may not be enough... - self.scoring_method = ord('m') - return - - cdef compute_max ( self ): - cdef: - np.ndarray p, c, v - int64_t l, i - - for chrom in sorted(self.data.keys()): - p = self.data[chrom][1] - c = self.data[chrom][2] - v = self.data[chrom][3] - l = self.datalength[chrom] - for i in range(l): - v[ i ] = max(p[ i ],c[ i ]) - self.scoring_method = ord('M') - return - - cpdef write_bedGraph ( self, fhd, str name, str description, short column = 3): - """Write all data to fhd in bedGraph Format. - - fhd: a filehandler to save bedGraph. - - name/description: the name and description in track line. - - colname: can be 1: chip, 2: control, 3: score - - """ - cdef: - bytes chrom - int32_t l, pre, i, p - float32_t pre_v, v - set chrs - np.ndarray pos, value - - assert column in range( 1, 4 ), "column should be between 1, 2 or 3." - - write = fhd.write - - if self.trackline: - # this line is REQUIRED by the wiggle format for UCSC browser - write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name.decode(), description ) ) - - chrs = self.get_chr_names() - for chrom in sorted(chrs): - pos = self.data[ chrom ][ 0 ] - value = self.data[ chrom ][ column ] - l = self.datalength[ chrom ] - pre = 0 - if pos.shape[ 0 ] == 0: continue # skip if there's no data - pre_v = value[ 0 ] - for i in range( 1, l ): - v = value[ i ] - p = pos[ i-1 ] - #if ('%.5f' % pre_v) != ('%.5f' % v): - if abs(pre_v - v) > 1e-5: # precision is 5 digits - write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) ) - pre_v = v - pre = p - p = pos[ -1 ] - # last one - write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) ) - - return True - - cpdef call_peaks (self, float32_t cutoff=5.0, int32_t min_length=200, int32_t max_gap=50, bool call_summits=False): - """This function try to find regions within which, scores - are continuously higher than a given cutoff. - - This function is NOT using sliding-windows. Instead, any - regions in bedGraph above certain cutoff will be detected, - then merged if the gap between nearby two regions are below - max_gap. After this, peak is reported if its length is above - min_length. - - cutoff: cutoff of value, default 5. For -log10pvalue, it means 10^-5. - min_length : minimum peak length, default 200. 
- max_gap : maximum gap to merge nearby peaks, default 50. - acll_summits: - """ - cdef: - int32_t i - bytes chrom - np.ndarray pos, sample, control, value, above_cutoff, above_cutoff_v, above_cutoff_endpos, above_cutoff_startpos, above_cutoff_sv - list peak_content - - chrs = self.get_chr_names() - peaks = PeakIO() # dictionary to save peaks - - self.cutoff = cutoff - for chrom in sorted(chrs): - peak_content = [] # to store points above cutoff - - pos = self.data[chrom][ 0 ] - sample = self.data[chrom][ 1 ] - control = self.data[chrom][ 2 ] - value = self.data[chrom][ 3 ] - - above_cutoff = np.nonzero( value >= cutoff )[0] # indices where score is above cutoff - above_cutoff_v = value[above_cutoff] # scores where score is above cutoff - - above_cutoff_endpos = pos[above_cutoff] # end positions of regions where score is above cutoff - above_cutoff_startpos = pos[above_cutoff-1] # start positions of regions where score is above cutoff - above_cutoff_sv= sample[above_cutoff] # sample pileup height where score is above cutoff - - if above_cutoff_v.size == 0: - # nothing above cutoff - continue - - if above_cutoff[0] == 0: - # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] - above_cutoff_startpos[0] = 0 - - # first bit of region above cutoff - peak_content.append( (above_cutoff_startpos[0], above_cutoff_endpos[0], above_cutoff_v[0], above_cutoff_sv[0], above_cutoff[0]) ) - for i in range( 1,above_cutoff_startpos.size ): - if above_cutoff_startpos[i] - peak_content[-1][1] <= max_gap: - # append - peak_content.append( (above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]) ) - else: - # close - if call_summits: - self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 ) - else: - self.__close_peak(peak_content, peaks, min_length, chrom ) - peak_content = [(above_cutoff_startpos[i], above_cutoff_endpos[i], above_cutoff_v[i], above_cutoff_sv[i], above_cutoff[i]),] - - # save the last peak - if not peak_content: - continue - else: - if call_summits: - self.__close_peak2(peak_content, peaks, min_length, chrom, max_gap//2 ) - else: - self.__close_peak(peak_content, peaks, min_length, chrom ) - - return peaks - - cdef bool __close_peak (self, list peak_content, object peaks, int32_t min_length, - bytes chrom): - """Close the peak region, output peak boundaries, peak summit - and scores, then add the peak to peakIO object. - - In this function, we define the peak summit as the middle - point of the region with the highest score, in this peak. For - example, if the region of the highest score is from 100 to - 200, the summit is 150. If there are several regions of the - same 'highest score', we will first calculate the possible - summit for each such region, then pick a position close to the - middle index ( = (len(highest_regions) + 1) / 2 ) of these - summits. For example, if there are three regions with the same - highest scores, [100,200], [300,400], [600,700], we will first - find the possible summits as 150, 350, and 650, and then pick - the middle index, the 2nd, of the three positions -- 350 as - the final summit. If there are four regions, we pick the 2nd - as well. 
- - peaks: a PeakIO object - - """ - cdef: - int32_t summit_pos, tstart, tend, tmpindex, summit_index, i, midindex - float32_t summit_value, tvalue, tsummitvalue - - peak_length = peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] - if peak_length >= min_length: # if the peak is too small, reject it - tsummit = [] - summit_pos = 0 - summit_value = 0 - for i in range(len(peak_content)): - (tstart,tend,tvalue,tsummitvalue, tindex) = peak_content[i] - #for (tstart,tend,tvalue,tsummitvalue, tindex) in peak_content: - if not summit_value or summit_value < tsummitvalue: - tsummit = [(tend + tstart) / 2, ] - tsummit_index = [ tindex, ] - summit_value = tsummitvalue - elif summit_value == tsummitvalue: - # remember continuous summit values - tsummit.append(int((tend + tstart) / 2)) - tsummit_index.append( tindex ) - # the middle of all highest points in peak region is defined as summit - midindex = int((len(tsummit) + 1) / 2) - 1 - summit_pos = tsummit[ midindex ] - summit_index = tsummit_index[ midindex ] - if self.scoring_method == ord('q'): - qscore = self.data[chrom][3][ summit_index ] - else: - # if q value is not computed, use -1 - qscore = -1 - - peaks.add( chrom, - peak_content[0][0], - peak_content[-1][1], - summit = summit_pos, - peak_score = self.data[chrom][ 3 ][ summit_index ], - pileup = self.data[chrom][ 1 ][ summit_index ], # should be the same as summit_value - pscore = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]), - fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ), - qscore = qscore, - ) - # start a new peak - return True - - cdef bool __close_peak2 (self, list peak_content, object peaks, int32_t min_length, - bytes chrom, int32_t smoothlen=51, - float32_t min_valley = 0.9): - """Close the peak region, output peak boundaries, peak summit - and scores, then add the peak to peakIO object. - - In this function, we use signal processing methods to smooth - the scores in the peak region, find the maxima and enforce the - peaky shape, and to define the best maxima as the peak - summit. The functions used for signal processing is 'maxima' - (with 2nd order polynomial filter) and 'enfoce_peakyness' - functions in SignalProcessing.pyx. 
- - peaks: a PeakIO object - - """ - cdef: - int32_t summit_pos, tstart, tend, tmpindex, summit_index, summit_offset - int32_t start, end, i, j, start_boundary - float32_t summit_value, tvalue, tsummitvalue -# np.ndarray[np.float32_t, ndim=1] w - np.ndarray[np.float32_t, ndim=1] peakdata - np.ndarray[np.int32_t, ndim=1] peakindices, summit_offsets - - # Add 10 bp padding to peak region so that we can get true minima - end = peak_content[ -1 ][ 1 ] + 10 - start = peak_content[ 0 ][ 0 ] - 10 - if start < 0: - start_boundary = 10 + start - start = 0 - else: - start_boundary = 10 - peak_length = end - start - if end - start < min_length: return # if the region is too small, reject it - - peakdata = np.zeros(end - start, dtype='float32') - peakindices = np.zeros(end - start, dtype='int32') - for (tstart,tend,tvalue,tsvalue, tmpindex) in peak_content: - i = tstart - start + start_boundary - j = tend - start + start_boundary - peakdata[i:j] = tsvalue - peakindices[i:j] = tmpindex - summit_offsets = maxima(peakdata, smoothlen) - if summit_offsets.shape[0] == 0: - # **failsafe** if no summits, fall back on old approach # - return self.__close_peak(peak_content, peaks, min_length, chrom) - else: - # remove maxima that occurred in padding - i = np.searchsorted(summit_offsets, start_boundary) - j = np.searchsorted(summit_offsets, peak_length + start_boundary, 'right') - summit_offsets = summit_offsets[i:j] - - summit_offsets = enforce_peakyness(peakdata, summit_offsets) - if summit_offsets.shape[0] == 0: - # **failsafe** if no summits, fall back on old approach # - return self.__close_peak(peak_content, peaks, min_length, chrom) - - summit_indices = peakindices[summit_offsets] - summit_offsets -= start_boundary - - peak_scores = self.data[chrom][3][ summit_indices ] - if not (peak_scores > self.cutoff).all(): - return self.__close_peak(peak_content, peaks, min_length, chrom) - for summit_offset, summit_index in zip(summit_offsets, summit_indices): - if self.scoring_method == ord('q'): - qscore = self.data[chrom][3][ summit_index ] - else: - # if q value is not computed, use -1 - qscore = -1 - peaks.add( chrom, - start, - end, - summit = start + summit_offset, - peak_score = self.data[chrom][3][ summit_index ], - pileup = self.data[chrom][1][ summit_index ], # should be the same as summit_value - pscore = get_pscore(self.data[chrom][ 1 ][ summit_index ], self.data[chrom][ 2 ][ summit_index ]), - fold_change = ( self.data[chrom][ 1 ][ summit_index ] + self.pseudocount ) / ( self.data[chrom][ 2 ][ summit_index ] + self.pseudocount ), - qscore = qscore, - ) - # start a new peak - return True - - cdef int64_t total ( self ): - """Return the number of regions in this object. - - """ - cdef: - int64_t t - bytes chrom - - t = 0 - for chrom in sorted(self.data.keys()): - t += self.datalength[chrom] - return t - - cpdef call_broadpeaks (self, float32_t lvl1_cutoff=5.0, float32_t lvl2_cutoff=1.0, int32_t min_length=200, int32_t lvl1_max_gap=50, int32_t lvl2_max_gap=400): - """This function try to find enriched regions within which, - scores are continuously higher than a given cutoff for level - 1, and link them using the gap above level 2 cutoff with a - maximum length of lvl2_max_gap. - - lvl1_cutoff: cutoff of value at enriched regions, default 5.0. - lvl2_cutoff: cutoff of value at linkage regions, default 1.0. - min_length : minimum peak length, default 200. - lvl1_max_gap : maximum gap to merge nearby enriched peaks, default 50. - lvl2_max_gap : maximum length of linkage regions, default 400. 
- - Return both general PeakIO object for highly enriched regions - and gapped broad regions in BroadPeakIO. - """ - cdef: - int32_t i - bytes chrom - - assert lvl1_cutoff > lvl2_cutoff, "level 1 cutoff should be larger than level 2." - assert lvl1_max_gap < lvl2_max_gap, "level 2 maximum gap should be larger than level 1." - lvl1_peaks = self.call_peaks(cutoff=lvl1_cutoff, min_length=min_length, max_gap=lvl1_max_gap) - lvl2_peaks = self.call_peaks(cutoff=lvl2_cutoff, min_length=min_length, max_gap=lvl2_max_gap) - chrs = lvl1_peaks.peaks.keys() - broadpeaks = BroadPeakIO() - # use lvl2_peaks as linking regions between lvl1_peaks - for chrom in sorted(chrs): - lvl1peakschrom = lvl1_peaks.peaks[chrom] - lvl2peakschrom = lvl2_peaks.peaks[chrom] - lvl1peakschrom_next = iter(lvl1peakschrom).__next__ - tmppeakset = [] # to temporarily store lvl1 region inside a lvl2 region - # our assumption is lvl1 regions should be included in lvl2 regions - try: - lvl1 = lvl1peakschrom_next() - for i in range( len(lvl2peakschrom) ): - # for each lvl2 peak, find all lvl1 peaks inside - # I assume lvl1 peaks can be ALL covered by lvl2 peaks. - lvl2 = lvl2peakschrom[i] - - while True: - if lvl2["start"] <= lvl1["start"] and lvl1["end"] <= lvl2["end"]: - tmppeakset.append(lvl1) - lvl1 = lvl1peakschrom_next() - else: - # make a hierarchical broad peak - #print lvl2["start"], lvl2["end"], lvl2["score"] - self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) - tmppeakset = [] - break - except StopIteration: - # no more strong (aka lvl1) peaks left - self.__add_broadpeak ( broadpeaks, chrom, lvl2, tmppeakset) - tmppeakset = [] - # add the rest lvl2 peaks - for j in range( i+1, len(lvl2peakschrom) ): - self.__add_broadpeak( broadpeaks, chrom, lvl2peakschrom[j], tmppeakset ) - - return broadpeaks - - def __add_broadpeak (self, bpeaks, bytes chrom, dict lvl2peak, list lvl1peakset): - """Internal function to create broad peak. - """ - - cdef: - int32_t blockNum, thickStart, thickEnd, start, end - bytes blockSizes, blockStarts - - start = lvl2peak["start"] - end = lvl2peak["end"] - - # the following code will add those broad/lvl2 peaks with no strong/lvl1 peaks inside - if not lvl1peakset: - # will complement by adding 1bps start and end to this region - # may change in the future if gappedPeak format was improved. 
- bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=(b"%d" % start), thickEnd=(b"%d" % end), - blockNum = 2, blockSizes = b"1,1", blockStarts = (b"0,%d" % (end-start-1)), pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) - return bpeaks - - thickStart = b"%d" % lvl1peakset[0]["start"] - thickEnd = b"%d" % lvl1peakset[-1]["end"] - blockNum = int(len(lvl1peakset)) - blockSizes = b",".join( [b"%d" % x["length"] for x in lvl1peakset] ) - blockStarts = b",".join( [b"%d" % (x["start"]-start) for x in lvl1peakset] ) - - if lvl2peak["start"] != thickStart: - # add 1bp mark for the start of lvl2 peak - thickStart = b"%d" % start - blockNum += 1 - blockSizes = b"1,"+blockSizes - blockStarts = b"0,"+blockStarts - if lvl2peak["end"] != thickEnd: - # add 1bp mark for the end of lvl2 peak - thickEnd = b"%d" % end - blockNum += 1 - blockSizes = blockSizes+b",1" - blockStarts = blockStarts + b"," + (b"%d" % (end-start-1)) - - # add to BroadPeakIO object - bpeaks.add(chrom, start, end, score=lvl2peak["score"], thickStart=thickStart, thickEnd=thickEnd, - blockNum = blockNum, blockSizes = blockSizes, blockStarts = blockStarts, pileup = lvl2peak["pileup"], - pscore = lvl2peak["pscore"], fold_change = lvl2peak["fc"], - qscore = lvl2peak["qscore"] ) - return bpeaks - -cdef class TwoConditionScores: - """Class for saving two condition comparison scores. - """ - cdef: - dict data # dictionary for data of each chromosome - dict datalength # length of data array of each chromosome - float32_t cond1_factor # factor to apply to cond1 pileup values - float32_t cond2_factor # factor to apply to cond2 pileup values - float32_t pseudocount # the pseudocount used to calcuate LLR - float32_t cutoff - object t1bdg, c1bdg, t2bdg, c2bdg - dict pvalue_stat1, pvalue_stat2, pvalue_stat3 - - def __init__ (self, t1bdg, c1bdg, t2bdg, c2bdg, float32_t cond1_factor = 1.0, float32_t cond2_factor = 1.0, float32_t pseudocount = 0.01, float32_t proportion_background_empirical_distribution = 0.99999 ): - """ - t1bdg: a bedGraphTrackI object for treat 1 - c1bdg: a bedGraphTrackI object for control 1 - t2bdg: a bedGraphTrackI object for treat 2 - c2bdg: a bedGraphTrackI object for control 2 - - cond1_factor: this will be multiplied to values in t1bdg and c1bdg - cond2_factor: this will be multiplied to values in t2bdg and c2bdg - - pseudocount: pseudocount, by default 0.01. - - proportion_background_empirical_distribution: proportion of genome as the background to build empirical distribution - - """ - - self.data = {} # for each chromosome, there is a l*4 - # matrix. First column: end position - # of a region; Second: treatment - # pileup; third: control pileup ; - # forth: score ( can be - # p/q-value/likelihood - # ratio/fold-enrichment/subtraction - # depending on -c setting) - self.datalength = {} - self.cond1_factor = cond1_factor - self.cond2_factor = cond2_factor - self.pseudocount = pseudocount - self.pvalue_stat1 = {} - self.pvalue_stat2 = {} - self.t1bdg = t1bdg - self.c1bdg = c1bdg - self.t2bdg = t2bdg - self.c2bdg = c2bdg - - #self.empirical_distr_llr = [] # save all values in histogram - - cpdef set_pseudocount( self, float32_t pseudocount ): - self.pseudocount = pseudocount - - cpdef build ( self ): - """Compute scores from 3 types of comparisons and store them in self.data. 
- - """ - cdef: - set common_chrs - bytes chrname - int32_t chrom_max_len - # common chromosome names - common_chrs = self.get_common_chrs() - for chrname in common_chrs: - (cond1_treat_ps, cond1_treat_vs) = self.t1bdg.get_data_by_chr(chrname) - (cond1_control_ps, cond1_control_vs) = self.c1bdg.get_data_by_chr(chrname) - (cond2_treat_ps, cond2_treat_vs) = self.t2bdg.get_data_by_chr(chrname) - (cond2_control_ps, cond2_control_vs) = self.c2bdg.get_data_by_chr(chrname) - chrom_max_len = len(cond1_treat_ps) + len(cond1_control_ps) +\ - len(cond2_treat_ps) + len(cond2_control_ps) - self.add_chromosome( chrname, chrom_max_len ) - self.build_chromosome( chrname, - cond1_treat_ps, cond1_control_ps, - cond2_treat_ps, cond2_control_ps, - cond1_treat_vs, cond1_control_vs, - cond2_treat_vs, cond2_control_vs ) - - - cdef build_chromosome( self, chrname, - cond1_treat_ps, cond1_control_ps, - cond2_treat_ps, cond2_control_ps, - cond1_treat_vs, cond1_control_vs, - cond2_treat_vs, cond2_control_vs ): - """Internal function to calculate scores for three types of comparisons. - - cond1_treat_ps, cond1_control_ps: position of treat and control of condition 1 - cond2_treat_ps, cond2_control_ps: position of treat and control of condition 2 - cond1_treat_vs, cond1_control_vs: value of treat and control of condition 1 - cond2_treat_vs, cond2_control_vs: value of treat and control of condition 2 - - """ - cdef: - int32_t c1tp, c1cp, c2tp, c2cp, minp, pre_p - float32_t c1tv, c1cv, c2tv, c2cv - c1tpn = iter(cond1_treat_ps).__next__ - c1cpn = iter(cond1_control_ps).__next__ - c2tpn = iter(cond2_treat_ps).__next__ - c2cpn = iter(cond2_control_ps).__next__ - c1tvn = iter(cond1_treat_vs).__next__ - c1cvn = iter(cond1_control_vs).__next__ - c2tvn = iter(cond2_treat_vs).__next__ - c2cvn = iter(cond2_control_vs).__next__ - - pre_p = 0 - - try: - c1tp = c1tpn() - c1tv = c1tvn() - - c1cp = c1cpn() - c1cv = c1cvn() - - c2tp = c2tpn() - c2tv = c2tvn() - - c2cp = c2cpn() - c2cv = c2cvn() - - while True: - minp = min(c1tp, c1cp, c2tp, c2cp) - self.add( chrname, pre_p, c1tv, c1cv, c2tv, c2cv ) - pre_p = minp - if c1tp == minp: - c1tp = c1tpn() - c1tv = c1tvn() - if c1cp == minp: - c1cp = c1cpn() - c1cv = c1cvn() - if c2tp == minp: - c2tp = c2tpn() - c2tv = c2tvn() - if c2cp == minp: - c2cp = c2cpn() - c2cv = c2cvn() - except StopIteration: - # meet the end of either bedGraphTrackI, simply exit - pass - return - - cdef set get_common_chrs ( self ): - cdef: - set t1chrs, c1chrs, t2chrs, c2chrs, common - t1chrs = self.t1bdg.get_chr_names() - c1chrs = self.c1bdg.get_chr_names() - t2chrs = self.t2bdg.get_chr_names() - c2chrs = self.c2bdg.get_chr_names() - common = reduce(lambda x,y:x.intersection(y), (t1chrs,c1chrs,t2chrs,c2chrs)) - return common - - cdef add_chromosome ( self, bytes chrom, int32_t chrom_max_len ): - """ - chrom: chromosome name - chrom_max_len: maximum number of data points in this chromosome - - """ - if chrom not in self.data: - self.data[chrom] = [ np.zeros( chrom_max_len, dtype="int32" ), # pos - np.zeros( chrom_max_len, dtype="float32" ), # LLR t1 vs c1 - np.zeros( chrom_max_len, dtype="float32" ), # LLR t2 vs c2 - np.zeros( chrom_max_len, dtype="float32" )] # LLR t1 vs t2 - self.datalength[chrom] = 0 - - cdef add (self, bytes chromosome, int32_t endpos, float32_t t1, float32_t c1, float32_t t2, float32_t c2): - """Take chr-endpos-sample1-control1-sample2-control2 and - compute logLR for t1 vs c1, t2 vs c2, and t1 vs t2, then save - values. 
- - chromosome: chromosome name in string - endpos : end position of each interval in integer - t1 : Sample 1 ChIP pileup value of each interval in float - c1 : Sample 1 Control pileup value of each interval in float - t2 : Sample 2 ChIP pileup value of each interval in float - c2 : Sample 2 Control pileup value of each interval in float - - *Warning* Need to add regions continuously. - """ - cdef: - int32_t i - list c - i = self.datalength[chromosome] - c = self.data[chromosome] - c[0][ i ] = endpos - c[1][ i ] = logLR_asym( (t1+self.pseudocount)*self.cond1_factor, (c1+self.pseudocount)*self.cond1_factor ) - c[2][ i ] = logLR_asym( (t2+self.pseudocount)*self.cond2_factor, (c2+self.pseudocount)*self.cond2_factor ) - c[3][ i ] = logLR_sym( (t1+self.pseudocount)*self.cond1_factor, (t2+self.pseudocount)*self.cond2_factor ) - self.datalength[chromosome] += 1 - return - - cpdef finalize ( self ): - """ - Adjust array size of each chromosome. - - """ - cdef: - bytes chrom - int32_t l - list d - - for chrom in sorted(self.data.keys()): - d = self.data[chrom] - l = self.datalength[chrom] - d[0].resize( l, refcheck = False ) - d[1].resize( l, refcheck = False ) - d[2].resize( l, refcheck = False ) - d[3].resize( l, refcheck = False ) - return - - cpdef get_data_by_chr (self, bytes chromosome): - """Return array of counts by chromosome. - - The return value is a tuple: - ([end pos],[value]) - """ - if chromosome in self.data: - return self.data[chromosome] - else: - return None - - cpdef get_chr_names (self): - """Return all the chromosome names stored. - - """ - l = set(self.data.keys()) - return l - - cpdef write_bedGraph ( self, fhd, str name, str description, int32_t column = 3): - """Write all data to fhd in bedGraph Format. - - fhd: a filehandler to save bedGraph. - - name/description: the name and description in track line. - - colname: can be 1: cond1 chip vs cond1 ctrl, 2: cond2 chip vs cond2 ctrl, 3: cond1 chip vs cond2 chip - - """ - cdef: - bytes chrom - int32_t l, pre, i, p - float32_t pre_v, v - np.ndarray pos, value - - assert column in range( 1, 4 ), "column should be between 1, 2 or 3." - - write = fhd.write - - #if self.trackline: - # # this line is REQUIRED by the wiggle format for UCSC browser - # write( "track type=bedGraph name=\"%s\" description=\"%s\"\n" % ( name.decode(), description ) ) - - chrs = self.get_chr_names() - for chrom in sorted(chrs): - pos = self.data[ chrom ][ 0 ] - value = self.data[ chrom ][ column ] - l = self.datalength[ chrom ] - pre = 0 - if pos.shape[ 0 ] == 0: continue # skip if there's no data - pre_v = value[ 0 ] - for i in range( 1, l ): - v = value[ i ] - p = pos[ i-1 ] - if abs(pre_v - v)>=1e-6: - write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) ) - pre_v = v - pre = p - p = pos[ -1 ] - # last one - write( "%s\t%d\t%d\t%.5f\n" % ( chrom.decode(), pre, p, pre_v ) ) - - return True - - cpdef write_matrix ( self, fhd, str name, str description ): - """Write all data to fhd into five columns Format: - - col1: chr_start_end - col2: t1 vs c1 - col3: t2 vs c2 - col4: t1 vs t2 - - fhd: a filehandler to save the matrix. 
- - """ - cdef: - bytes chrom - int32_t l, pre, i, p - float32_t v1, v2, v3 - np.ndarray pos, value1, value2, value3 - - write = fhd.write - - chrs = self.get_chr_names() - for chrom in sorted(chrs): - [ pos, value1, value2, value3 ] = self.data[ chrom ] - l = self.datalength[ chrom ] - pre = 0 - if pos.shape[ 0 ] == 0: continue # skip if there's no data - for i in range( 0, l ): - v1 = value1[ i ] - v2 = value2[ i ] - v3 = value3[ i ] - p = pos[ i ] - write( "%s:%d_%d\t%.5f\t%.5f\t%.5f\n" % ( chrom.decode(), pre, p, v1, v2, v3 ) ) - pre = p - - return True - - cpdef tuple call_peaks (self, float32_t cutoff=3, int32_t min_length=200, int32_t max_gap = 100, - bool call_summits=False): - """This function try to find regions within which, scores - are continuously higher than a given cutoff. - - For bdgdiff. - - This function is NOT using sliding-windows. Instead, any - regions in bedGraph above certain cutoff will be detected, - then merged if the gap between nearby two regions are below - max_gap. After this, peak is reported if its length is above - min_length. - - cutoff: cutoff of value, default 3. For log10 LR, it means 1000 or -1000. - min_length : minimum peak length, default 200. - max_gap : maximum gap to merge nearby peaks, default 100. - ptrack: an optional track for pileup heights. If it's not None, use it to find summits. Otherwise, use self/scoreTrack. - """ - cdef: - int32_t i - bytes chrom - np.ndarray pos, t1_vs_c1, t2_vs_c2, t1_vs_t2, \ - cond1_over_cond2, cond2_over_cond1, cond1_equal_cond2, \ - cond1_sig, cond2_sig,\ - cat1, cat2, cat3, \ - cat1_startpos, cat1_endpos, cat2_startpos, cat2_endpos, \ - cat3_startpos, cat3_endpos - chrs = self.get_chr_names() - cat1_peaks = PeakIO() # dictionary to save peaks significant at condition 1 - cat2_peaks = PeakIO() # dictionary to save peaks significant at condition 2 - cat3_peaks = PeakIO() # dictionary to save peaks significant in both conditions - - self.cutoff = cutoff - - for chrom in sorted(chrs): - pos = self.data[chrom][ 0 ] - t1_vs_c1 = self.data[chrom][ 1 ] - t2_vs_c2 = self.data[chrom][ 2 ] - t1_vs_t2 = self.data[chrom][ 3 ] - and_ = np.logical_and - cond1_over_cond2 = t1_vs_t2 >= cutoff # regions with stronger cond1 signals - cond2_over_cond1 = t1_vs_t2 <= -1*cutoff # regions with stronger cond2 signals - cond1_equal_cond2= and_( t1_vs_t2 >= -1*cutoff, t1_vs_t2 <= cutoff ) - cond1_sig = t1_vs_c1 >= cutoff # enriched regions in condition 1 - cond2_sig = t2_vs_c2 >= cutoff # enriched regions in condition 2 - # indices where score is above cutoff - cat1 = np.where( and_( cond1_sig, cond1_over_cond2 ) )[ 0 ] # cond1 stronger than cond2, the indices - cat2 = np.where( and_( cond2_over_cond1, cond2_sig ) )[ 0 ] # cond2 stronger than cond1, the indices - cat3 = np.where( and_( and_( cond1_sig, cond2_sig ), # cond1 and cond2 are equal, the indices - cond1_equal_cond2 ) ) [ 0 ] - - cat1_endpos = pos[cat1] # end positions of regions where score is above cutoff - cat1_startpos = pos[cat1-1] # start positions of regions where score is above cutoff - cat2_endpos = pos[cat2] # end positions of regions where score is above cutoff - cat2_startpos = pos[cat2-1] # start positions of regions where score is above cutoff - cat3_endpos = pos[cat3] # end positions of regions where score is above cutoff - cat3_startpos = pos[cat3-1] # start positions of regions where score is above cutoff - - # for cat1: condition 1 stronger regions - self.__add_a_peak ( cat1_peaks, chrom, cat1, cat1_startpos, cat1_endpos, t1_vs_t2, max_gap, min_length ) - # 
for cat2: condition 2 stronger regions - self.__add_a_peak ( cat2_peaks, chrom, cat2, cat2_startpos, cat2_endpos, -1 * t1_vs_t2, max_gap, min_length ) - # for cat3: commonly strong regions - self.__add_a_peak ( cat3_peaks, chrom, cat3, cat3_startpos, cat3_endpos, abs(t1_vs_t2), max_gap, min_length ) - - return (cat1_peaks, cat2_peaks, cat3_peaks) - - cdef object __add_a_peak ( self, object peaks, bytes chrom, np.ndarray indices, np.ndarray startpos, np.ndarray endpos, - np.ndarray score, int32_t max_gap, int32_t min_length ): - """For a given chromosome, merge nearby significant regions, - filter out smaller regions, then add regions to PeakIO - object. - - """ - cdef: - int32_t i - list peak_content - float32_t mean_logLR - - if startpos.size > 0: - # if it is not empty - peak_content = [] - if indices[0] == 0: - # first element > cutoff, fix the first point as 0. otherwise it would be the last item in data[chrom]['pos'] - startpos[0] = 0 - # first bit of region above cutoff - peak_content.append( (startpos[0], endpos[0], score[indices[ 0 ]]) ) - for i in range( 1, startpos.size ): - if startpos[i] - peak_content[-1][1] <= max_gap: - # append - peak_content.append( ( startpos[i], endpos[i], score[indices[ i ]] ) ) - else: - # close - if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length: - mean_logLR = self.mean_from_peakcontent( peak_content ) - #if peak_content[0][0] == 22414956: - # print(f"{peak_content} {mean_logLR}") - peaks.add( chrom, peak_content[0][0], peak_content[-1][1], - summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0, - fold_change = 0, qscore = 0, - ) - peak_content = [(startpos[i], endpos[i], score[ indices[ i ] ]),] - - # save the last peak - if peak_content: - if peak_content[ -1 ][ 1 ] - peak_content[ 0 ][ 0 ] >= min_length: - mean_logLR = self.mean_from_peakcontent( peak_content ) - peaks.add( chrom, peak_content[0][0], peak_content[-1][1], - summit = -1, peak_score = mean_logLR, pileup = 0, pscore = 0, - fold_change = 0, qscore = 0, - ) - - return - - cdef float32_t mean_from_peakcontent ( self, list peakcontent ): - """ - - """ - cdef: - int32_t tmp_s, tmp_e - int32_t l - float64_t tmp_v, sum_v #for better precision - float32_t r - int32_t i - - l = 0 - sum_v = 0 #initialize sum_v as 0 - for i in range( len(peakcontent) ): - tmp_s = peakcontent[i][0] - tmp_e = peakcontent[i][1] - tmp_v = peakcontent[i][2] - sum_v += tmp_v * ( tmp_e - tmp_s ) - l += tmp_e - tmp_s - - r = ( sum_v / l ) - return r - - - cdef int64_t total ( self ): - """Return the number of regions in this object. - - """ - cdef: - int64_t t - bytes chrom - - t = 0 - for chrom in sorted(self.data.keys()): - t += self.datalength[chrom] - return t - - diff --git a/MACS3/Signal/SignalProcessing.pyx b/MACS3/Signal/SignalProcessing.py similarity index 61% rename from MACS3/Signal/SignalProcessing.pyx rename to MACS3/Signal/SignalProcessing.py index 3a9a7220..5632e821 100644 --- a/MACS3/Signal/SignalProcessing.pyx +++ b/MACS3/Signal/SignalProcessing.py @@ -1,6 +1,6 @@ # cython: language_level=3 # cython: profile=True -# Time-stamp: <2024-05-14 11:43:45 Tao Liu> +# Time-stamp: <2024-10-15 11:25:35 Tao Liu> """Module Description: functions to find maxima minima or smooth the signal tracks. 
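Before the SignalProcessing hunks below, a recap of the two-condition peak assembly deleted above: intervals above the cutoff are merged when the gap separating them is at most max_gap, merged blocks shorter than min_length are discarded, and each reported peak carries a length-weighted mean logLR, as in mean_from_peakcontent(). A self-contained sketch of those rules (hypothetical names, not from the patch):

    def merge_scored_regions(regions, max_gap=100, min_length=200):
        # regions: sorted (start, end, score) tuples that passed the cutoff
        peaks = []
        block = [regions[0]]
        for start, end, score in regions[1:]:
            if start - block[-1][1] <= max_gap:
                block.append((start, end, score))
            else:
                close_block(block, peaks, min_length)
                block = [(start, end, score)]
        close_block(block, peaks, min_length)
        return peaks

    def close_block(block, peaks, min_length):
        start, end = block[0][0], block[-1][1]
        if end - start >= min_length:
            covered = sum(e - b for b, e, _ in block)   # gaps are not counted
            mean = sum(s * (e - b) for b, e, s in block) / covered
            peaks.append((start, end, mean))

    # merge_scored_regions([(0, 150, 4.0), (200, 400, 2.0)])
    # -> [(0, 400, 2.857...)]: one merged peak, mean = (4*150 + 2*200) / 350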
@@ -20,39 +20,42 @@ # ------------------------------------ # smoothing function import numpy as np -cimport numpy as np -from numpy cimport uint8_t, uint16_t, uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t, float32_t, float64_t -from cpython cimport bool +import cython +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool -cpdef np.ndarray[int32_t, ndim=1] maxima(np.ndarray[float32_t, ndim=1] signal, - int window_size=51): +@cython.ccall +def maxima(signal: cnp.ndarray(cython.float, ndim=1), + window_size: cython.int = 51) -> cnp.ndarray: """return the local maxima in a signal after applying a 2nd order Savitsky-Golay (polynomial) filter using window_size specified """ - cdef: - np.ndarray[int32_t, ndim=1] m - np.ndarray[float64_t, ndim=1] smoothed - np.ndarray[float64_t, ndim=1] sign, diff + m: cnp.ndarray(cython.int, ndim=1) + smoothed: cnp.ndarray(cython.double, ndim=1) + sign: cnp.ndarray(cython.double, ndim=1) + diff: cnp.ndarray(cython.double, ndim=1) - window_size = window_size//2*2+1 # to make an odd number + window_size = window_size//2*2+1 # to make an odd number smoothed = savitzky_golay_order2_deriv1(signal, window_size).round(16) - sign = np.sign( smoothed ) - diff = np.diff( sign ) - m = np.where( diff <= -1)[0].astype("int32") + sign = np.sign(smoothed) + diff = np.diff(sign) + m = np.where(diff <= -1)[0].astype("i4") return m -cdef np.ndarray[int32_t, ndim=1] internal_minima( np.ndarray[float32_t, ndim=1] signal, - np.ndarray[int32_t, ndim=1] maxima ): - cdef: - np.ndarray[int32_t, ndim=1] ret - int32_t n = maxima.shape[0] - int32_t i, v, v2 + +@cython.cfunc +def internal_minima(signal: cnp.ndarray(cython.float, ndim=1), + maxima: cnp.ndarray(cython.int, ndim=1)) -> cnp.ndarray: + ret: cnp.ndarray(cython.int, ndim=1) + n: cython.int = maxima.shape[0] + i: cython.int + if n == 0 or n == 1: - ret = np.ndarray(0, 'int32') + ret = np.ndarray(0, 'i4') return ret else: - ret = np.zeros(n - 1, 'int32') + ret = np.zeros(n - 1, 'i4') pos1 = maxima[0] for i in range(n - 1): pos2 = maxima[i + 1] @@ -60,38 +63,51 @@ pos1 = pos2 return ret -cdef inline float32_t sqrt(float32_t threshold): + +@cython.cfunc +@cython.inline +def sqrt(threshold: cython.float) -> cython.float: return mathsqrt(threshold) -cpdef enforce_peakyness(np.ndarray[float32_t, ndim=1] signal, - np.ndarray[int32_t, ndim=1] maxima): - """requires peaks described by a signal and a set of points where the signal - is at a maximum to meet a certain set of criteria + +@cython.ccall +def enforce_peakyness(signal: cnp.ndarray(cython.float, ndim=1), + maxima: cnp.ndarray(cython.int, ndim=1)): + """requires peaks described by a signal and a set of points where + the signal is at a maximum to meet a certain set of criteria maxima which do not meet the required criteria are discarded criteria: for each peak: - calculate a threshold of the maximum of its adjacent two minima - plus the sqrt of that value + + calculate a threshold of the maximum of its adjacent two + minima plus the sqrt of that value + subtract the threshold from the region bounded by those minima + clip that region if negative values occur inside it + require it be > 50 bp in width -- controlled by is_valied_peak() - require that it not be too flat (< 6 unique values) -- controlled by is_valid_peak() + + require that it not be too flat (< 6 unique values) -- + controlled by is_valid_peak() + """ - cdef: - np.ndarray[int32_t, ndim=1] minima = internal_minima(signal, maxima) - np.ndarray[float32_t, ndim=1] new_signal - int32_t n = 
minima.shape[0] - float32_t threshold - np.ndarray[int32_t, ndim=1] peaky_maxima = maxima.copy() - int32_t j = 0 - if n == 0: return maxima -# else: + minima: cnp.ndarray(cython.int, ndim=1) = internal_minima(signal, maxima) + new_signal: cnp.ndarray(cython.float, ndim=1) + n: cython.int = minima.shape[0] + threshold: cython.float + peaky_maxima: cnp.ndarray(cython.int, ndim=1) = maxima.copy() + j: cython.int = 0 + + if n == 0: + return maxima + threshold = signal[minima[0]] threshold += sqrt(threshold) new_signal = signal[0:minima[0]] - threshold - sqrt(threshold) -# assert maxima[0] < minima[0], '%d > %d' % ( maxima[0], minima[0] ) + if is_valid_peak(new_signal, maxima[0]): peaky_maxima[0] = maxima[0] j += 1 @@ -103,7 +119,7 @@ if is_valid_peak(new_signal, new_maximum): peaky_maxima[j] = maxima[i + 1] j += 1 - threshold = signal[minima[-1]] + threshold = signal[minima[-1]] threshold += sqrt(threshold) new_signal = signal[minima[-1]:] - threshold new_maximum = maxima[-1] - minima[-1] @@ -113,11 +129,14 @@ peaky_maxima.resize(j, refcheck=False) return peaky_maxima + # hardcoded minimum peak width = 50 -cdef bool is_valid_peak(np.ndarray[float32_t, ndim=1] signal, int maximum): - cdef: - np.ndarray s - int32_t length +@cython.cfunc +def is_valid_peak(signal: cnp.ndarray(cython.float, ndim=1), + maximum: cython.int) -> bool: + s: cnp.ndarray + length: cython.int + s = hard_clip(signal, maximum) length = s.shape[0] if length < 50: @@ -126,69 +145,84 @@ return False return True + # require at least 6 different float values -- prevents broad flat peaks -cdef bool too_flat(np.ndarray[float32_t, ndim=1] signal): +@cython.cfunc +def too_flat(signal: cnp.ndarray(cython.float, ndim=1)) -> bool: """return whether signal has at least 6 unique values """ return np.unique(signal).shape[0] < 6 + # hard clip a region with negative values -cdef np.ndarray[float32_t, ndim=1] hard_clip(np.ndarray[float32_t, ndim=1] signal, int32_t maximum): +@cython.cfunc +def hard_clip(signal: cnp.ndarray(cython.float, ndim=1), + maximum: cython.int) -> cnp.ndarray: """clip the signal in both directions at the nearest values <= 0 to position maximum """ - cdef: - int32_t i - int32_t left = 0 - int32_t right = signal.shape[0] + i: cython.int + left: cython.int = 0 + right: cython.int = signal.shape[0] + # clip left - for i in range( right - maximum, 0 ): - if signal[ -i ] < 0: + for i in range(right - maximum, 0): + if signal[-i] < 0: left = i break for i in range(maximum, right): if signal[i] < 0: right = i break - return signal[ left:right ] + return signal[left:right] + -cpdef np.ndarray[ int32_t, ndim=1 ] enforce_valleys(np.ndarray[ float32_t, ndim=1 ] signal, - np.ndarray[ int32_t, ndim=1 ] summits, - float32_t min_valley = 0.8 ): +@cython.ccall +def enforce_valleys(signal: cnp.ndarray(cython.float, ndim=1), + summits: cnp.ndarray(cython.int, ndim=1), + min_valley: cython.float = 0.8) -> cnp.ndarray: """require a value of <= min_valley * lower summit between each pair of summits """ - cdef: - float32_t req_min, v, prev_v - int32_t summit_pos, prev_summit_pos - int32_t n_summits - int32_t n_valid_summits - np.ndarray[ int32_t, ndim=1 ] valid_summits + req_min: cython.float + v: cython.float + prev_v: cython.float + + summit_pos: cython.int + prev_summit_pos: cython.int + n_summits: cython.int + n_valid_summits: cython.int + + valid_summits: cnp.ndarray(cython.int, ndim=1) + n_summits = summits.shape[0] - n_valid_summits = 1 - valid_summits = summits.copy() + n_valid_summits = 1 + valid_summits = summits.copy() # Remove 
peaks that do not have sufficient valleys - if n_summits == 1: return summits - for i in range( 1, n_summits ): - prev_summit_pos = valid_summits[ n_valid_summits-1 ] - summit_pos = summits[ i ] - prev_v = signal[ prev_summit_pos ] - v = signal[ summit_pos ] - req_min = min_valley * min( prev_v, v ) - if ( signal[ prev_summit_pos:summit_pos ] < req_min ).any(): - valid_summits[ n_valid_summits ] = summit_pos + if n_summits == 1: + return summits + for i in range(1, n_summits): + prev_summit_pos = valid_summits[n_valid_summits-1] + summit_pos = summits[i] + prev_v = signal[prev_summit_pos] + v = signal[summit_pos] + req_min = min_valley * min(prev_v, v) + if (signal[prev_summit_pos:summit_pos] < req_min).any(): + valid_summits[n_valid_summits] = summit_pos n_valid_summits += 1 elif v > prev_v: - valid_summits[ n_valid_summits-1 ] = summit_pos - valid_summits.resize( n_valid_summits, refcheck=False ) + valid_summits[n_valid_summits-1] = summit_pos + valid_summits.resize(n_valid_summits, refcheck=False) return valid_summits + # Modified from http://www.scipy.org/Cookbook/SavitzkyGolay # positive window_size not enforced anymore # needs sane input paramters, window size > 4 # switched to double precision for internal accuracy -cpdef np.ndarray[float64_t, ndim=1] savitzky_golay_order2_deriv1(np.ndarray[float32_t, ndim=1] signal, - int32_t window_size): +@cython.ccall +def savitzky_golay_order2_deriv1(signal: cnp.ndarray(cython.float, ndim=1), + window_size: cython.int) -> cnp.ndarray: """Smooth (and optionally differentiate) data with a Savitzky-Golay filter. The Savitzky-Golay filter removes high frequency noise from data. It has the advantage of preserving the original shape and @@ -223,31 +257,40 @@ W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery Cambridge University Press ISBN-13: 9780521880688 """ - cdef: - int32_t half_window, k - np.ndarray[int64_t, ndim=2] b - # pad the signal at the extremes with - # values taken from the signal itself - np.ndarray[float32_t, ndim=1] firstvals, lastvals - np.ndarray[float64_t, ndim=1] m, ret - - if window_size % 2 != 1: window_size += 1 + half_window: cython.int + b: cnp.ndarray(cython.long, ndim=2) + # pad the signal at the extremes with + # values taken from the signal itself + firstvals: cnp.ndarray(cython.float, ndim=1) + lastvals: cnp.ndarray(cython.float, ndim=1) + m: cnp.ndarray(cython.double, ndim=1) + ret: cnp.ndarray(cython.double, ndim=1) + + if window_size % 2 != 1: + window_size += 1 half_window = (window_size - 1) // 2 # precompute coefficients b = np.array([[1, k, k**2] for k in range(-half_window, half_window+1)], - dtype='int64') + dtype='i8') m = np.linalg.pinv(b)[1] # pad the signal at the extremes with # values taken from the signal itself firstvals = signal[0] - np.abs(signal[1:half_window+1][::-1] - signal[0]) lastvals = signal[-1] + np.abs(signal[-half_window-1:-1][::-1] - signal[-1]) signal = np.concatenate((firstvals, signal, lastvals)) - ret = np.convolve( m[::-1], signal.astype("float64"), mode='valid') #.astype("float32").round(8) # round to 8 decimals to avoid signing issue + ret = np.convolve(m[::-1], + signal.astype("f8"), + mode='valid') return ret + # Another modified version from http://www.scipy.org/Cookbook/SavitzkyGolay -cpdef np.ndarray[float32_t, ndim=1] savitzky_golay( np.ndarray[float32_t, ndim=1] y, int32_t window_size, - int32_t order, int32_t deriv = 0, int32_t rate = 1 ): +@cython.ccall +def savitzky_golay(y: cnp.ndarray(cython.float, ndim=1), + window_size: cython.int, + order: cython.int, + 
deriv: cython.int = 0, + rate: cython.int = 1) -> cnp.ndarray: """Smooth (and optionally differentiate) data with a Savitzky-Golay filter. The Savitzky-Golay filter removes high frequency noise from data. It has the advantage of preserving the original shape and @@ -278,7 +321,7 @@ Examples -------- t = np.linspace(-4, 4, 500) - y = np.exp( -t**2 ) + np.random.normal(0, 0.05, t.shape) + y = np.exp(-t**2) + np.random.normal(0, 0.05, t.shape) ysg = savitzky_golay(y, window_size=31, order=4) import matplotlib.pyplot as plt plt.plot(t, y, label='Noisy signal') @@ -295,31 +338,34 @@ W.H. Press, S.A. Teukolsky, W.T. Vetterling, B.P. Flannery Cambridge University Press ISBN-13: 9780521880688 """ - cdef: - int32_t half_window, k - np.ndarray[int64_t, ndim=2] b - # pad the signal at the extremes with - # values taken from the signal itself - np.ndarray[float32_t, ndim=1] firstvals, lastvals, ret - np.ndarray[float64_t, ndim=1] m + half_window: cython.int + b: cnp.ndarray(cython.long, ndim=2) + # pad the signal at the extremes with + # values taken from the signal itself + firstvals: cnp.ndarray(cython.float, ndim=1) + lastvals: cnp.ndarray(cython.float, ndim=1) + ret: cnp.ndarray(cython.float, ndim=1) + m: cnp.ndarray(cython.double, ndim=1) try: - window_size = np.abs( np.int( window_size ) ) - order = np.abs( np.int( order ) ) - except ValueError, msg: + window_size = np.abs(np.int(window_size)) + order = np.abs(np.int(order)) + except ValueError: raise ValueError("window_size and order have to be of type int") if window_size % 2 != 1 or window_size < 1: raise TypeError("window_size size must be a positive odd number") if window_size < order + 2: raise TypeError("window_size is too small for the polynomials order") - half_window = ( window_size -1 ) // 2 + half_window = (window_size - 1) // 2 # precompute coefficients - b = np.array( [ [ k**i for i in range( order + 1 ) ] for k in range( -half_window, half_window+1 ) ] ) - m = np.linalg.pinv( b )[ deriv ] * rate**deriv * mathfactorial( deriv ) + b = np.array([[k**i + for i in range(order + 1)] + for k in range(-half_window, half_window+1)]) + m = np.linalg.pinv(b)[deriv] * rate**deriv * mathfactorial(deriv) # pad the signal at the extremes with # values taken from the signal itself - firstvals = y[ 0 ] - np.abs( y[ 1:half_window + 1 ][ ::-1 ] - y[ 0 ] ) - lastvals = y[ -1 ] + np.abs( y[ -half_window - 1:-1 ][ ::-1 ] - y[ -1 ]) - y = np.concatenate( ( firstvals, y, lastvals ) ) - ret = np.convolve( m[ ::-1 ], y, mode = 'valid' ).astype("float32") + firstvals = y[0] - np.abs(y[1:half_window + 1][::-1] - y[0]) + lastvals = y[-1] + np.abs(y[-half_window - 1:-1][::-1] - y[-1]) + y = np.concatenate((firstvals, y, lastvals)) + ret = np.convolve(m[::-1], y, mode='valid').astype("float32") return ret diff --git a/MACS3/Signal/UnitigRACollection.py b/MACS3/Signal/UnitigRACollection.py new file mode 100644 index 00000000..0c3f31e8 --- /dev/null +++ b/MACS3/Signal/UnitigRACollection.py @@ -0,0 +1,324 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 17:14:11 Tao Liu> + +"""Module + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). 
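The rewritten maxima() above keeps the original algorithm: estimate the signal's first derivative with a second-order Savitzky-Golay filter, then report the positions where the smoothed derivative's sign drops from positive to negative. The following numpy-only restatement condenses the two functions; it assumes the signal is longer than the window, and the names are illustrative rather than the module's API.

    import numpy as np

    def sg_order2_deriv1(signal, window_size):
        window_size = window_size // 2 * 2 + 1       # force an odd window
        half = (window_size - 1) // 2
        # least-squares quadratic fit: row 1 of pinv gives derivative weights
        b = np.array([[1, k, k**2] for k in range(-half, half + 1)], dtype='i8')
        m = np.linalg.pinv(b)[1]
        # mirror-pad both ends so the 'valid' convolution covers the signal
        firstvals = signal[0] - np.abs(signal[1:half + 1][::-1] - signal[0])
        lastvals = signal[-1] + np.abs(signal[-half - 1:-1][::-1] - signal[-1])
        padded = np.concatenate((firstvals, signal, lastvals))
        return np.convolve(m[::-1], padded.astype('f8'), mode='valid')

    def find_maxima(signal, window_size=51):
        d1 = sg_order2_deriv1(signal, window_size).round(16)
        # a local maximum is a +1 -> -1 transition in the derivative's sign
        return np.where(np.diff(np.sign(d1)) <= -1)[0].astype('i4')

    # find_maxima(np.sin(np.linspace(0, 6.28, 400)).astype('f4'))
    # -> a single index near 100, where the sine peaks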
+""" +# ------------------------------------ +# python modules +# ------------------------------------ +from operator import itemgetter + +from MACS3.Signal.ReadAlignment import ReadAlignment +from MACS3.Signal.PosReadsInfo import PosReadsInfo +from MACS3.IO.PeakIO import PeakIO + +import cython +from cython.cimports.cpython import bool + +# ------------------------------------ +# constants +# ------------------------------------ +__version__ = "Parser $Revision$" +__author__ = "Tao Liu " +__doc__ = "All Parser classes" + +__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. + +__CIGARCODE__ = "MIDNSHP=X" + +# ------------------------------------ +# Misc functions +# ------------------------------------ + +# ------------------------------------ +# Classes +# ------------------------------------ + + +@cython.cclass +class UnitigRAs: + """ + """ + RAlists: list # [RAlists_T, RAlists_C] + seq: bytes + unitig_aln: bytes + reference_aln: bytes + chrom: bytes + lpos: cython.long + rpos: cython.long + unitig_length: cython.long + reference_length: cython.long + aln_length: cython.long + + def __init__(self, + chrom: bytes, + lpos: cython.long, + rpos: cython.long, + unitig_aln: bytes, + reference_aln: bytes, + RAlists: list): + assert len(unitig_aln) == len(reference_aln), Exception("aln on unitig and reference should be the same length!") + self.chrom = chrom + self.lpos = lpos + self.rpos = rpos + self.unitig_aln = unitig_aln + self.reference_aln = reference_aln + self.RAlists = RAlists + # fill in other information + self.seq = self.unitig_aln.replace(b'-', b'') + self.unitig_length = len(self.seq) + self.reference_length = rpos - lpos + self.aln_length = len(unitig_aln) + + def __getitem__(self, keyname): + if keyname == "chrom": + return self.chrom + elif keyname == "lpos": + return self.lpos + elif keyname == "rpos": + return self.rpos + elif keyname == "seq": + return self.seq + elif keyname == "unitig_aln": + return self.unitig_aln + elif keyname == "reference_aln": + return self.reference_aln + elif keyname == "unitig_length": + return self.unitig_length + elif keyname == "reference_length": + return self.reference_length + elif keyname == "aln_length": + return self.aln_length + elif keyname == "count": + return len(self.RAlists[0]) + len(self.RAlists[1]) + else: + raise KeyError("Unavailable key:", keyname) + + def __getstate__(self): + return (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, + self.chrom, self.lpos, self.rpos, self.unitig_length, + self.reference_length, self.aln_length) + + def __setstate__(self, state): + (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, + self.chrom, self.lpos, self.rpos, self.unitig_length, + self.reference_length, self.aln_length) = state + + @cython.ccall + def 
get_variant_bq_by_ref_pos(self, + ref_pos: cython.long) -> tuple: + """ + return (s, bq_list_t, bq_list_c, strand_list_t, strand_list_c) + """ + i: cython.long + index_aln: cython.long + index_unitig: cython.long + residue: cython.long + ra: ReadAlignment + s: bytes + bq_list_t: list = [] + bq_list_c: list = [] + strand_list_t: list = [] + strand_list_c: list = [] + tip_list_t: list = [] + pos_list_t: list = [] + pos_list_c: list = [] + ra_seq: bytes + ra_pos: cython.long + l_read: cython.int + + # b'TTATTAGAAAAAAT' find = 2 + # b'AAAAATCCCACAGG' + # b'TTTTATTAGAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' + # b'TTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' lpos=100 + # | | | + # genome 108 113 120 + # aln 8 13 21 + # unitig 8 13 21 + # ref 8 13 20 + # read1 6 11 + # read2 3 11 + # find the position + residue = ref_pos - self.lpos + 1 + index_aln = 0 + for i in range(self.aln_length): + if self.reference_aln[i] != 45: # 45 means b'-' + residue -= 1 + if residue == 0: + break + index_aln += 1 + + # index_aln should be the position on aln + s = self.unitig_aln[index_aln:index_aln+1] + # find the index on unitig + index_unitig = len(self.unitig_aln[:index_aln+1].replace(b'-', b'')) + + if s == b'-': # deletion + for ra in self.RAlists[0]: + ra_seq = ra["SEQ"] + l_read = ra["l"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + if ra_pos == 0 or ra_pos == l_read - 1: + tip_list_t.append(True) + else: + tip_list_t.append(False) + bq_list_t.append(93) + strand_list_t.append(ra["strand"]) + pos_list_t.append(ra_pos) + for ra in self.RAlists[1]: + ra_seq = ra["SEQ"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + bq_list_c.append(93) + strand_list_c.append(ra["strand"]) + pos_list_c.append(ra_pos) + return (bytearray(b'*'), bq_list_t, bq_list_c, strand_list_t, + strand_list_c, tip_list_t, pos_list_t, pos_list_c) + + if index_aln < self.aln_length - 1: + for i in range(index_aln + 1, self.aln_length): + if self.reference_aln[i] == 45: # insertion detected, 45 means b'-' + s += self.unitig_aln[i:i+1] # we extend the s string to contain the inserted seq + else: + break + + for ra in self.RAlists[0]: # treatment + ra_seq = ra["SEQ"] + l_read = ra["l"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + if ra_pos < l_read and ra_pos >= 0: + pos_list_t.append(ra_pos) + if ra_pos == 0 or ra_pos == l_read - 1: + tip_list_t.append(True) + else: + tip_list_t.append(False) + bq_list_t.append(ra["binaryqual"][ra_pos]) + strand_list_t.append(ra["strand"]) + + for ra in self.RAlists[1]: # control + ra_seq = ra["SEQ"] + l_read = ra["l"] + ra_pos = index_unitig - self.seq.find(ra_seq) - 1 + if ra_pos < l_read and ra_pos >= 0: + pos_list_c.append(ra_pos) + bq_list_c.append(ra["binaryqual"][ra_pos]) + strand_list_c.append(ra["strand"]) + + return (bytearray(s), bq_list_t, bq_list_c, strand_list_t, + strand_list_c, tip_list_t, pos_list_t, pos_list_c) + + +@cython.cclass +class UnitigCollection: + """A collection of ReadAlignment objects and the corresponding + PeakIO. 
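The coordinate walk at the top of get_variant_bq_by_ref_pos() above is the delicate step: a genomic position must be translated into a column of the gapped alignment by counting only non-gap reference columns (byte value 45 is ord('-')). Isolated, the walk looks like this (hypothetical function name, not part of the patch):

    def ref_pos_to_aln_column(reference_aln, lpos, ref_pos):
        # consume (ref_pos - lpos + 1) reference bases, skipping b'-' columns
        residue = ref_pos - lpos + 1
        for col in range(len(reference_aln)):
            if reference_aln[col] != ord('-'):   # indexing bytes yields an int
                residue -= 1
                if residue == 0:
                    return col
        raise ValueError("ref_pos lies beyond the aligned region")

    # ref_pos_to_aln_column(b'TTTTATTACAAAAA-TCCCA', 100, 115) -> 16
    # (reference base 16 sits one column right of the gap at column 14)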
+ + """ + chrom: bytes + peak: PeakIO # A PeakIO object + URAs_list: list + left: cython.long # left position of peak + right: cython.long # right position of peak + length: cython.long # length of peak + URAs_left: cython.long # left position of all RAs in the collection + URAs_right: cython.long # right position of all RAs in the collection + is_sorted: bool # if sorted by lpos + + def __init__(self, + chrom: bytes, + peak: PeakIO, + URAs_list: list = []): + self.chrom = chrom + self.peak = peak + self.URAs_list = URAs_list + self.left = peak["start"] + self.right = peak["end"] + self.length = self.right - self.left + self.URAs_left = URAs_list[0]["lpos"] # initial assignment of RAs_left + self.URAs_right = URAs_list[-1]["rpos"] # initial assignment of RAs_right + self.sort() # it will set self.is_sorted = True + # check RAs_left and RAs_right + for ura in URAs_list: + if ura["lpos"] < self.URAs_left: + self.URAs_left = ura["lpos"] + if ura["rpos"] > self.URAs_right: + self.URAs_right = ura["rpos"] + + def __getitem__(self, keyname): + if keyname == "chrom": + return self.chrom + elif keyname == "left": + return self.left + elif keyname == "right": + return self.right + elif keyname == "URAs_left": + return self.URAs_left + elif keyname == "URAs_right": + return self.URAs_right + elif keyname == "length": + return self.length + elif keyname == "count": + return len(self.URAs_list) + elif keyname == "URAs_list": + return self.URAs_list + else: + raise KeyError("Unavailable key:", keyname) + + def __getstate__(self): + return (self.chrom, self.peak, self.URAs_list, self.left, self.right, + self.length, self.URAs_left, self.URAs_right, self.is_sorted) + + def __setstate__(self, state): + (self.chrom, self.peak, self.URAs_list, self.left, self.right, + self.length, self.URAs_left, self.URAs_right, self.is_sorted) = state + + @cython.ccall + def sort(self): + """Sort RAs according to lpos. Should be used after realignment. + + """ + self.URAs_list.sort(key=itemgetter("lpos")) + self.is_sorted = True + return + + @cython.ccall + def get_PosReadsInfo_ref_pos(self, + ref_pos: cython.long, + ref_nt: bytes, + Q: cython.int = 20): + """Generate a PosReadsInfo for: object a given reference genome + position. + + Return a PosReadsInfo object. 
+ + """ + s: bytearray + bq_list_t: list + bq_list_c: list + strand_list_t: list + strand_list_c: list + tip_list_t: list + pos_list_t: list + pos_list_c: list + ura: object + i: cython.int + posreadsinfo_p: PosReadsInfo + + posreadsinfo_p = PosReadsInfo(ref_pos, ref_nt) + for i in range(len(self.URAs_list)): + ura = self.URAs_list[i] + if ura["lpos"] <= ref_pos and ura["rpos"] > ref_pos: + (s, bq_list_t, bq_list_c, strand_list_t, strand_list_c, + tip_list_t, pos_list_t, pos_list_c) = ura.get_variant_bq_by_ref_pos(ref_pos) + for i in range(len(bq_list_t)): + posreadsinfo_p.add_T(i, bytes(s), bq_list_t[i], + strand_list_t[i], tip_list_t[i], Q=Q) + for i in range(len(bq_list_c)): + posreadsinfo_p.add_C(i, bytes(s), bq_list_c[i], + strand_list_c[i], Q=Q) + + return posreadsinfo_p diff --git a/MACS3/Signal/UnitigRACollection.pyx b/MACS3/Signal/UnitigRACollection.pyx deleted file mode 100644 index 33051ffd..00000000 --- a/MACS3/Signal/UnitigRACollection.pyx +++ /dev/null @@ -1,309 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2022-02-18 11:44:57 Tao Liu> - -"""Module for SAPPER BAMParser class - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). -""" -# ------------------------------------ -# python modules -# ------------------------------------ -from collections import Counter -from operator import itemgetter -from copy import copy - -from MACS3.Signal.ReadAlignment import ReadAlignment -from MACS3.Signal.PosReadsInfo import PosReadsInfo -from MACS3.IO.PeakIO import PeakIO - -from cpython cimport bool - -import numpy as np -cimport numpy as np -from numpy cimport uint32_t, uint64_t, int32_t, int64_t - -cdef extern from "stdlib.h": - ctypedef unsigned int size_t - size_t strlen(char *s) - void *malloc(size_t size) - void *calloc(size_t n, size_t size) - void free(void *ptr) - int strcmp(char *a, char *b) - char * strcpy(char *a, char *b) - long atol(char *bytes) - int atoi(char *bytes) - -# ------------------------------------ -# constants -# ------------------------------------ -__version__ = "Parser $Revision$" -__author__ = "Tao Liu " -__doc__ = "All Parser classes" - -__DNACOMPLEMENT__ = b'\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@TBGDEFCHIJKLMNOPQRSAUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff' # A trans table to convert A to T, C to G, G to C, and T to A. 
- -__CIGARCODE__ = "MIDNSHP=X" - -# ------------------------------------ -# Misc functions -# ------------------------------------ - -# ------------------------------------ -# Classes -# ------------------------------------ - -cdef class UnitigRAs: - """ - """ - cdef: - list RAlists # [RAlists_T, RAlists_C] - bytes seq - bytes unitig_aln - bytes reference_aln - bytes chrom - long lpos - long rpos - long unitig_length - long reference_length - long aln_length - - def __init__ ( self, bytes chrom, long lpos, long rpos, bytes unitig_aln, bytes reference_aln, list RAlists ): - assert len( unitig_aln )==len( reference_aln ), Exception("aln on unitig and reference should be the same length!") - self.chrom = chrom - self.lpos = lpos - self.rpos = rpos - self.unitig_aln = unitig_aln - self.reference_aln = reference_aln - self.RAlists = RAlists - # fill in other information - self.seq = self.unitig_aln.replace(b'-',b'') - self.unitig_length = len( self.seq ) - self.reference_length = rpos - lpos - self.aln_length = len( unitig_aln ) - - def __getitem__ ( self, keyname ): - if keyname == "chrom": - return self.chrom - elif keyname == "lpos": - return self.lpos - elif keyname == "rpos": - return self.rpos - elif keyname == "seq": - return self.seq - elif keyname == "unitig_aln": - return self.unitig_aln - elif keyname == "reference_aln": - return self.reference_aln - elif keyname == "unitig_length": - return self.unitig_length - elif keyname == "reference_length": - return self.reference_length - elif keyname == "aln_length": - return self.aln_length - elif keyname == "count": - return len( self.RAlists[0] ) + len( self.RAlists[1] ) - else: - raise KeyError("Unavailable key:", keyname) - - def __getstate__ ( self ): - return (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, self.chrom, self.lpos, self.rpos, self.unitig_length, self.reference_length, self.aln_length ) - - def __setstate__ ( self, state ): - (self.RAlists, self.seq, self.unitig_aln, self.reference_aln, self.chrom, self.lpos, self.rpos, self.unitig_length, self.reference_length, self.aln_length ) = state - - - cpdef tuple get_variant_bq_by_ref_pos( self, long ref_pos ): - """ - - return ( s, bq_list_t, bq_list_c, strand_list_t, strand_list_c ) - """ - cdef: - long i - long index_aln - long index_unitig - long residue - object ra - bytes s - list bq_list_t = [] - list bq_list_c = [] - list strand_list_t = [] - list strand_list_c = [] - list tip_list_t = [] - list pos_list_t = [] - list pos_list_c = [] - bytes ra_seq - long ra_pos - int p_seq - int l_read - - # b'TTATTAGAAAAAAT' find = 2 - # b'AAAAATCCCACAGG' - #b'TTTTATTAGAAAAAATCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' - #b'TTTTATTACAAAAA-TCCCACAGGCAGCCACTAGGTGGCAGTAACAGGCTTTTGCCAGCGGCTCCAGTCAGCATGGCTTGACTGTGTGCT' lpos=100 - # | | | - #genome 108 113 120 - #aln 8 13 21 - #unitig 8 13 21 - #ref 8 13 20 - #read1 6 11 - #read2 3 11 - # find the position - residue = ref_pos - self.lpos + 1 - index_aln = 0 - for i in range( self.aln_length ): - if self.reference_aln[ i ] != 45: # 45 means b'-' - residue -= 1 - if residue == 0: - break - index_aln += 1 - - # index_aln should be the position on aln - s = self.unitig_aln[ index_aln:index_aln+1 ] - # find the index on unitig - index_unitig = len( self.unitig_aln[:index_aln+1].replace(b'-',b'') ) - - if s == b'-': #deletion - for ra in self.RAlists[ 0 ]: - ra_seq = ra["SEQ"] - l_read = ra["l"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - if ra_pos == 0 or ra_pos == l_read -1: - 
tip_list_t.append( True ) - else: - tip_list_t.append( False ) - bq_list_t.append( 93 ) - strand_list_t.append( ra["strand"] ) - pos_list_t.append( ra_pos ) - for ra in self.RAlists[ 1 ]: - ra_seq = ra["SEQ"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - bq_list_c.append( 93 ) - strand_list_c.append( ra["strand"] ) - pos_list_c.append( ra_pos ) - return ( bytearray(b'*'), bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c ) - - if index_aln < self.aln_length - 1: - for i in range( index_aln + 1, self.aln_length ): - if self.reference_aln[ i ] == 45: #insertion detected, 45 means b'-' - s += self.unitig_aln[ i:i+1 ] # we extend the s string to contain the inserted seq - else: - break - - for ra in self.RAlists[0]: #treatment - ra_seq = ra["SEQ"] - l_read = ra["l"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - if ra_pos < l_read and ra_pos >= 0: - pos_list_t.append( ra_pos ) - if ra_pos == 0 or ra_pos == l_read -1: - tip_list_t.append( True ) - else: - tip_list_t.append( False ) - bq_list_t.append( ra["binaryqual"][ra_pos] ) - strand_list_t.append( ra["strand"] ) - - for ra in self.RAlists[1]: #control - ra_seq = ra["SEQ"] - l_read = ra["l"] - ra_pos = index_unitig - self.seq.find( ra_seq ) - 1 - if ra_pos < l_read and ra_pos >= 0: - pos_list_c.append( ra_pos ) - bq_list_c.append( ra["binaryqual"][ra_pos] ) - strand_list_c.append( ra["strand"] ) - - return (bytearray(s), bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c ) - -cdef class UnitigCollection: - """A collection of ReadAlignment objects and the corresponding - PeakIO. - - """ - cdef: - bytes chrom - object peak # A PeakIO object - list URAs_list - long left # left position of peak - long right # right position of peak - long length # length of peak - long URAs_left # left position of all RAs in the collection - long URAs_right # right position of all RAs in the collection - bool sorted # if sorted by lpos - - def __init__ ( self, chrom, peak, URAs_list=[] ): - self.chrom = chrom - self.peak = peak - self.URAs_list = URAs_list - self.left = peak["start"] - self.right = peak["end"] - self.length = self.right - self.left - self.URAs_left = URAs_list[ 0 ]["lpos"] # initial assignment of RAs_left - self.URAs_right = URAs_list[-1]["rpos"] # initial assignment of RAs_right - self.sort() # it will set self.sorted = True - # check RAs_left and RAs_right - for ura in URAs_list: - if ura[ "lpos" ] < self.URAs_left: - self.URAs_left = ura[ "lpos" ] - if ura[ "rpos" ] > self.URAs_right: - self.URAs_right = ura[ "rpos" ] - - def __getitem__ ( self, keyname ): - if keyname == "chrom": - return self.chrom - elif keyname == "left": - return self.left - elif keyname == "right": - return self.right - elif keyname == "URAs_left": - return self.URAs_left - elif keyname == "URAs_right": - return self.URAs_right - elif keyname == "length": - return self.length - elif keyname == "count": - return len( self.URAs_list ) - elif keyname == "URAs_list": - return self.URAs_list - else: - raise KeyError("Unavailable key:", keyname) - - def __getstate__ ( self ): - return (self.chrom, self.peak, self.URAs_list, self.left, self.right, self.length, self.URAs_left, self.URAs_right, self.sorted) - - def __setstate__ ( self, state ): - (self.chrom, self.peak, self.URAs_list, self.left, self.right, self.length, self.URAs_left, self.URAs_right, self.sorted) = state - - cpdef sort ( self ): - """Sort RAs according to lpos. Should be used after realignment. 
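Both the deleted Cython class and its pure-Python replacement define explicit __getstate__/__setstate__ pairs. For cdef/@cython.cclass types this is the customary route to pickling, since their attributes live in C struct slots rather than an instance __dict__, and returning a bare tuple keeps the pickle compact. A minimal pure-Python sketch of the same protocol (PointLike is illustrative):

    import pickle

    class PointLike:                    # stands in for a @cython.cclass type
        def __init__(self, chrom, lpos):
            self.chrom, self.lpos = chrom, lpos

        def __getstate__(self):         # a bare tuple, as UnitigCollection returns
            return (self.chrom, self.lpos)

        def __setstate__(self, state):
            (self.chrom, self.lpos) = state

    p = pickle.loads(pickle.dumps(PointLike(b"chr1", 42)))
    assert (p.chrom, p.lpos) == (b"chr1", 42)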
- - """ - self.URAs_list.sort(key=itemgetter("lpos")) - self.sorted = True - return - - cpdef object get_PosReadsInfo_ref_pos ( self, long ref_pos, bytes ref_nt, int Q=20 ): - """Generate a PosReadsInfo object for a given reference genome - position. - - Return a PosReadsInfo object. - - """ - cdef: - bytearray s, bq - list bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c - object ura - int i - - posreadsinfo_p = PosReadsInfo( ref_pos, ref_nt ) - for i in range( len( self.URAs_list ) ): - ura = self.URAs_list[ i ] - if ura[ "lpos" ] <= ref_pos and ura[ "rpos" ] > ref_pos: - ( s, bq_list_t, bq_list_c, strand_list_t, strand_list_c, tip_list_t, pos_list_t, pos_list_c ) = ura.get_variant_bq_by_ref_pos( ref_pos ) - for i in range( len(bq_list_t) ): - posreadsinfo_p.add_T( i, bytes(s), bq_list_t[i], strand_list_t[i], tip_list_t[i], Q=Q ) - for i in range( len(bq_list_c) ): - posreadsinfo_p.add_C( i, bytes(s), bq_list_c[i], strand_list_c[i], Q=Q ) - - return posreadsinfo_p diff --git a/MACS3/Signal/VariantStat.py b/MACS3/Signal/VariantStat.py new file mode 100644 index 00000000..b154a03d --- /dev/null +++ b/MACS3/Signal/VariantStat.py @@ -0,0 +1,549 @@ +# cython: language_level=3 +# cython: profile=True +# Time-stamp: <2024-10-22 14:47:22 Tao Liu> + +"""Module for SAPPER BAMParser class + +Copyright (c) 2017 Tao Liu + +This code is free software; you can redistribute it and/or modify it +under the terms of the BSD License (see the file COPYING included +with the distribution). + +@status: experimental +@version: $Revision$ +@author: Tao Liu +@contact: tliu4@buffalo.edu +""" + +# ------------------------------------ +# python modules +# ------------------------------------ +import cython + +import cython.cimports.numpy as cnp +from cython.cimports.cpython import bool + +from math import log1p, exp, log + +LN10 = 2.3025850929940458 +LN10_tenth = 0.23025850929940458 + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.ccall +def CalModel_Homo(top1_bq_T: cnp.ndarray(cython.int, ndim=1), + top1_bq_C: cnp.ndarray(cython.int, ndim=1), + top2_bq_T: cnp.ndarray(cython.int, ndim=1), + top2_bq_C: cnp.ndarray(cython.int, ndim=1)) -> tuple: + """Return (lnL, BIC). + + """ + i: cython.int + lnL: cython.double + BIC: cython.double + + lnL = 0 + # Phred score is Phred = -10log_{10} E, where E is the error rate. 
+ # to get the 1-E: 1-E = 1-exp(Phred/-10*M_LN10) = 1-exp(Phred * -LOG10_E_tenth) + for i in range(top1_bq_T.shape[0]): + lnL += log1p(-exp(-top1_bq_T[i]*LN10_tenth)) + for i in range(top1_bq_C.shape[0]): + lnL += log1p(-exp(-top1_bq_C[i]*LN10_tenth)) + + for i in range(top2_bq_T.shape[0]): + lnL += log(exp(-top2_bq_T[i]*LN10_tenth)) + for i in range(top2_bq_C.shape[0]): + lnL += log(exp(-top2_bq_C[i]*LN10_tenth)) + + BIC = -2*lnL # no free variable, no penalty + return (lnL, BIC) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.ccall +def CalModel_Heter_noAS(top1_bq_T: cnp.ndarray(cython.int, ndim=1), + top1_bq_C: cnp.ndarray(cython.int, ndim=1), + top2_bq_T: cnp.ndarray(cython.int, ndim=1), + top2_bq_C: cnp.ndarray(cython.int, ndim=1)) -> tuple: + """Return (lnL, BIC) + + k_T + k_C + """ + k_T: cython.int + k_C: cython.int + lnL: cython.double + BIC: cython.double + tn_T: cython.int + tn_C: cython.int + # tn: cython.int # total observed NTs + lnL_T: cython.double + lnL_C: cython.double # log likelihood for treatment and control + + lnL = 0 + BIC = 0 + # for k_T + # total oberseved treatment reads from top1 and top2 NTs + tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] + + if tn_T == 0: + raise Exception("Total number of treatment reads is 0!") + else: + (lnL_T, k_T) = GreedyMaxFunctionNoAS(top1_bq_T.shape[0], + top2_bq_T.shape[0], + tn_T, + top1_bq_T, + top2_bq_T) + lnL += lnL_T + BIC += -2*lnL_T + + # for k_C + tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] + + if tn_C == 0: + pass + else: + (lnL_C, k_C) = GreedyMaxFunctionNoAS(top1_bq_C.shape[0], + top2_bq_C.shape[0], + tn_C, + top1_bq_C, + top2_bq_C) + lnL += lnL_C + BIC += -2*lnL_C + + # tn = tn_C + tn_T + + # we penalize big model depending on the number of reads/samples + if tn_T == 0: + BIC += log(tn_C) + elif tn_C == 0: + BIC += log(tn_T) + else: + BIC += log(tn_T) + log(tn_C) + + return (lnL, BIC) + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.ccall +def CalModel_Heter_AS(top1_bq_T: cnp.ndarray(cython.int, ndim=1), + top1_bq_C: cnp.ndarray(cython.int, ndim=1), + top2_bq_T: cnp.ndarray(cython.int, ndim=1), + top2_bq_C: cnp.ndarray(cython.int, ndim=1), + max_allowed_ar: cython.float = 0.99) -> tuple: + """Return (lnL, BIC) + + kc + ki + AS_alleleratio + """ + k_T: cython.int + k_C: cython.int + lnL: cython.double + BIC: cython.double + tn_T: cython.int + tn_C: cython.int + # tn: cython.int # total observed NTs + lnL_T: cython.double + lnL_C: cython.double # log likelihood for treatment and control + AS_alleleratio: cython.double # allele ratio + + lnL = 0 + BIC = 0 + + # Treatment + tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] + + if tn_T == 0: + raise Exception("Total number of treatment reads is 0!") + else: + (lnL_T, k_T, AS_alleleratio) = GreedyMaxFunctionAS(top1_bq_T.shape[0], + top2_bq_T.shape[0], + tn_T, + top1_bq_T, + top2_bq_T, + max_allowed_ar) + # print ">>>",lnL_T, k_T, AS_alleleratio + lnL += lnL_T + BIC += -2*lnL_T + + # control + tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] + + if tn_C == 0: + pass + else: + # We assume control will not have allele preference + (lnL_C, k_C) = GreedyMaxFunctionNoAS(top1_bq_C.shape[0], + top2_bq_C.shape[0], + tn_C, + top1_bq_C, + top2_bq_C) + lnL += lnL_C + BIC += -2*lnL_C + + # we penalize big model depending on the number of reads/samples + if tn_T == 0: + BIC += log(tn_C) + elif tn_C == 0: + BIC += 2 * log(tn_T) + else: + BIC += 2 * log(tn_T) + log(tn_C) + + return (lnL, BIC) + + +@cython.boundscheck(False) +@cython.wraparound(False) 
+@cython.cfunc +def GreedyMaxFunctionAS(m: cython.int, + n: cython.int, + tn: cython.int, + me: cnp.ndarray(cython.int, ndim=1), + ne: cnp.ndarray(cython.int, ndim=1), + max_allowed_ar: cython.float = 0.99) -> tuple: + """Return lnL, k and alleleratio in tuple. + + Note: I only translate Liqing's C++ code into pyx here. Haven't + done any review. + + """ + dnew: cython.double + dold: cython.double + rold: cython.double + rnew: cython.double + kold: cython.int + knew: cython.int + btemp: bool + k0: cython.int + dl: cython.double + dr: cython.double + d0: cython.double + d1l: cython.double + d1r: cython.double + + assert m+n == tn + btemp = False + if tn == 1: # only 1 read; I don't expect this to be run... + dl = calculate_ln(m, n, tn, me, ne, 0, 0) + dr = calculate_ln(m, n, tn, me, ne, 1, 1) + + if dl > dr: + k = 0 + return (dl, 0, 0) + else: + k = 1 + return (dr, 1, 1) + elif m == 0: # no top1 nt + return (calculate_ln(m, n, tn, me, ne, 0, m, max_allowed_ar), + m, + 1-max_allowed_ar) + # k0 = m + 1 + elif m == tn: # all reads are top1 + return (calculate_ln(m, n, tn, me, ne, 1, m, max_allowed_ar), + m, + max_allowed_ar) + else: + k0 = m + + d0 = calculate_ln(m, n, tn, me, ne, float(k0)/tn, k0, max_allowed_ar) + d1l = calculate_ln(m, n, tn, me, ne, float(k0-1)/tn, k0-1, max_allowed_ar) + d1r = calculate_ln(m, n, tn, me, ne, float(k0+1)/tn, k0+1, max_allowed_ar) + + if d0 > d1l-1e-8 and d0 > d1r-1e-8: + k = k0 + ar = float(k0)/tn + return (d0, k, ar) + elif d1l > d0: + dold = d1l + kold = k0-1 + rold = float(k0-1)/tn + while kold > 1: # disable: when kold=1 still run, than knew=0 is the final run + knew = kold - 1 + rnew = float(knew)/tn + + dnew = calculate_ln(m, + n, + tn, + me, + ne, + rnew, + knew, + max_allowed_ar) + + if (dnew-1e-8 < dold): + btemp = True + break + kold = knew + dold = dnew + rold = rnew + + if btemp: # maximum L value is in [1,m-1]; + k = kold + ar = rold + return (dold, k, ar) + else: # L(k=0) is the max for [0,m-1] + k = kold + ar = rold + return (dold, k, ar) + + elif d1r > d0: + dold = d1r + kold = k0 + 1 + rold = float(k0 + 1)/tn + while kold < tn - 1: # //disable: when kold=tn-1 still run, than knew=tn is the final run + knew = kold + 1 + + rnew = float(knew)/tn + + dnew = calculate_ln(m, + n, + tn, + me, + ne, + rnew, + knew, + max_allowed_ar) + + if dnew - 1e-8 < dold: + btemp = True + break + kold = knew + dold = dnew + rold = rnew + + if btemp: # maximum L value is in [m+1,tn-1] + k = kold + ar = rold + return (dold, k, ar) + else: # L(k=tn) is the max for [m+1,tn] + k = kold + ar = rold + return (dold, k, ar) + else: + raise Exception("error in GreedyMaxFunctionAS") + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cfunc +def GreedyMaxFunctionNoAS(m: cython.int, + n: cython.int, + tn: cython.int, + me: cnp.ndarray(cython.int, ndim=1), + ne: cnp.ndarray(cython.int, ndim=1)) -> tuple: + """Return lnL, and k in tuple. + + Note: I only translate Liqing's C++ code into pyx here. Haven't + done any review. 
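GreedyMaxFunctionAS and GreedyMaxFunctionNoAS share one search strategy: start from k0 = m, probe both neighbors, then walk in whichever direction improves the log likelihood until a step fails to improve it by more than 1e-8. Because lnL is unimodal in k, this reaches the maximum without scanning all tn+1 values. The skeleton, stripped of the likelihood details (hypothetical names; assumes a unimodal objective):

    def greedy_argmax(f, k0, lo, hi, eps=1e-8):
        best_k, best = k0, f(k0)
        for step in (-1, +1):
            k = k0 + step
            if not (lo <= k <= hi):
                continue
            v = f(k)
            if v <= best:
                continue
            best_k, best = k, v
            while lo <= best_k + step <= hi:
                nxt = f(best_k + step)
                if nxt - eps < best:        # same stopping tolerance as above
                    break
                best_k, best = best_k + step, nxt
            break
        return best_k, best

    # greedy_argmax(lambda k: -(k - 7) ** 2, k0=4, lo=0, hi=10) -> (7, 0)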
+ + """ + dnew: cython.double + dold: cython.double + kold: cython.int + knew: cython.int + btemp: bool + k0: cython.int + bg_r: cython.double + dl: cython.double + dr: cython.double + d0: cython.double + d1l: cython.double + d1r: cython.double + + btemp = False + bg_r = 0.5 + + if tn == 1: + dl = calculate_ln(m, n, tn, me, ne, bg_r, 0) + dr = calculate_ln(m, n, tn, me, ne, bg_r, 1) + if dl > dr: + k = 0 + return (dl, 0) + else: + k = 1 + return (dr, 1) + elif m == 0: # no top1 nt + return (calculate_ln(m, n, tn, me, ne, bg_r, m), m) + # k0 = m + 1 + elif m == tn: # all reads are top1 + return (calculate_ln(m, n, tn, me, ne, bg_r, m), m) + # elif m == 0: + # k0 = m + 1 + # elif m == tn: + # k0 = m - 1 + else: + k0 = m + + d0 = calculate_ln(m, n, tn, me, ne, bg_r, k0) + d1l = calculate_ln(m, n, tn, me, ne, bg_r, k0 - 1) + d1r = calculate_ln(m, n, tn, me, ne, bg_r, k0 + 1) + + if d0 > d1l - 1e-8 and d0 > d1r - 1e-8: + k = k0 + return (d0, k) + elif d1l > d0: + dold = d1l + kold = k0 - 1 + while kold >= 1: # //when kold=1 still run, than knew=0 is the final run + knew = kold - 1 + dnew = calculate_ln(m, n, tn, me, ne, bg_r, knew) + if dnew - 1e-8 < dold: + btemp = True + break + kold = knew + dold = dnew + + if btemp: # //maximum L value is in [1,m-1]; + k = kold + return (dold, k) + else: # //L(k=0) is the max for [0,m-1] + k = kold + return (dold, k) + elif d1r > d0: + dold = d1r + kold = k0 + 1 + while kold <= tn - 1: # //when kold=tn-1 still run, than knew=tn is the final run + knew = kold + 1 + dnew = calculate_ln(m, n, tn, me, ne, bg_r, knew) + if dnew - 1e-8 < dold: + btemp = True + break + kold = knew + dold = dnew + + if btemp: # //maximum L value is in [m+1,tn-1] + k = kold + return (dold, k) + else: # //L(k=tn) is the max for [m+1,tn] + k = kold + return (dold, k) + else: + raise Exception("error in GreedyMaxFunctionNoAS") + + +@cython.boundscheck(False) +@cython.wraparound(False) +@cython.cfunc +def calculate_ln(m: cython.int, + n: cython.int, + tn: cython.int, + me: cnp.ndarray(cython.int, ndim=1), + ne: cnp.ndarray(cython.int, ndim=1), + r: cython.double, + k: cython.int, + max_allowed_r: cython.float = 0.99): + """Calculate log likelihood given quality of top1 and top2, the + ratio r and the observed k. 
+ + """ + i: cython.int + lnL: cython.double + e: cython.double + + lnL = 0 + + # r is extremely high or + if r > max_allowed_r or r < 1 - max_allowed_r: + lnL += k*log(max_allowed_r) + (tn-k)*log(1 - max_allowed_r) + else: + lnL += k*log(r) + (tn-k)*log(1-r) + + # it's entirely biased toward 1 allele + if k == 0 or k == tn: + pass + elif k <= tn/2: + for i in range(k): + lnL += log(float(tn-i)/(k-i)) + else: + for i in range(tn-k): + lnL += log(float(tn-i)/(tn-k-i)) + + for i in range(m): + e = exp(- me[i] * LN10_tenth) + lnL += log((1-e)*(float(k)/tn) + e*(1-float(k)/tn)) + + for i in range(n): + e = exp(- ne[i] * LN10_tenth) + lnL += log((1-e)*(1-float(k)/tn) + e*(float(k)/tn)) + + return lnL + + +@cython.ccall +def calculate_GQ(lnL1: cython.double, + lnL2: cython.double, + lnL3: cython.double) -> cython.int: + """GQ1 = -10*log_{10}((L2+L3)/(L1+L2+L3)) + """ + L1: cython.double + L2: cython.double + L3: cython.double + s: cython.double + tmp: cython.double + GQ_score: cython.int + + # L1 = exp(lnL1-lnL1) + L1 = 1 + L2 = exp(lnL2-lnL1) + L3 = exp(lnL3-lnL1) + + # if L1 > 1: + # L1 = 1 + + if L2 > 1: + L2 = 1 + if L3 > 1: + L3 = 1 + # if(L1<1e-110) L1=1e-110; + if L2 < 1e-110: + L2 = 1e-110 + if L3 < 1e-110: + L3 = 1e-110 + + s = L1 + L2 + L3 + tmp = (L2 + L3)/s + if tmp > 1e-110: + GQ_score = (int)(-4.34294*log(tmp)) + else: + GQ_score = 255 + + return GQ_score + + +@cython.ccall +def calculate_GQ_heterASsig(lnL1: cython.double, + lnL2: cython.double) -> cython.int: + """ + """ + L1: cython.double + L2: cython.double + s: cython.double + tmp: cython.double + ASsig_score: cython.int + + # L1=exp(2.7182818,lnL1-lnL1) + L1 = 1 + L2 = exp(lnL2 - lnL1) + + # if L1 > 1: + # L1 = 1 + if L2 > 1: + L2 = 1 + # if L1 < 1e-110: + # L1 = 1e-110 + if L2 < 1e-110: + L2 = 1e-110 + + s = L1 + L2 + tmp = L2/s + if tmp > 1e-110: + ASsig_score = (int)(-4.34294*log(tmp)) + else: + ASsig_score = 255 + + return ASsig_score diff --git a/MACS3/Signal/VariantStat.pyx b/MACS3/Signal/VariantStat.pyx deleted file mode 100644 index be2699ac..00000000 --- a/MACS3/Signal/VariantStat.pyx +++ /dev/null @@ -1,461 +0,0 @@ -# cython: language_level=3 -# cython: profile=True -# Time-stamp: <2020-12-04 18:41:28 Tao Liu> - -"""Module for SAPPER BAMParser class - -Copyright (c) 2017 Tao Liu - -This code is free software; you can redistribute it and/or modify it -under the terms of the BSD License (see the file COPYING included -with the distribution). - -@status: experimental -@version: $Revision$ -@author: Tao Liu -@contact: tliu4@buffalo.edu -""" - -# ------------------------------------ -# python modules -# ------------------------------------ -from cpython cimport bool - -cimport cython - -import numpy as np -cimport numpy as np - -ctypedef np.float32_t float32_t -ctypedef np.int32_t int32_t - -#from libc.math cimport log10, log, exp, M_LN10 #,fabs,log1p -#from libc.math cimport M_LN10 -from math import log1p, exp, log - -LN10 = 2.3025850929940458 -LN10_tenth = 0.23025850929940458 - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cpdef tuple CalModel_Homo( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C): - """Return (lnL, BIC). - - """ - cdef: - int i - double lnL, BIC - - lnL=0 - # Phred score is Phred = -10log_{10} E, where E is the error rate. 
- # to get the 1-E: 1-E = 1-exp( Phred/-10*M_LN10 ) = 1-exp( Phred * -LOG10_E_tenth ) - for i in range( top1_bq_T.shape[0] ): - lnL += log1p( -exp(-top1_bq_T[ i ]*LN10_tenth) ) - for i in range( top1_bq_C.shape[0] ): - lnL += log1p( -exp(-top1_bq_C[ i ]*LN10_tenth) ) - - for i in range( top2_bq_T.shape[0] ): - lnL += log( exp(-top2_bq_T[ i ]*LN10_tenth) ) - for i in range( top2_bq_C.shape[0] ): - lnL += log( exp(-top2_bq_C[ i ]*LN10_tenth) ) - - BIC = -2*lnL # no free variable, no penalty - return (lnL, BIC) - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cpdef tuple CalModel_Heter_noAS( np.ndarray[int32_t, ndim=1] top1_bq_T,np.ndarray[int32_t, ndim=1] top1_bq_C,np.ndarray[int32_t, ndim=1] top2_bq_T,np.ndarray[int32_t, ndim=1] top2_bq_C ): - """Return (lnL, BIC) - - k_T - k_C - """ - cdef: - int k_T, k_C - double lnL, BIC - int i - int tn_T, tn_C, tn # total observed NTs - double lnL_T, lnL_C # log likelihood for treatment and control - - lnL = 0 - BIC = 0 - #for k_T - # total oberseved treatment reads from top1 and top2 NTs - tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] - - if tn_T == 0: - raise Exception("Total number of treatment reads is 0!") - else: - ( lnL_T, k_T ) = GreedyMaxFunctionNoAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T ) - lnL += lnL_T - BIC += -2*lnL_T - - #for k_C - tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] - - if tn_C == 0: - pass - else: - ( lnL_C, k_C ) = GreedyMaxFunctionNoAS( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C ) - lnL += lnL_C - BIC += -2*lnL_C - - tn = tn_C + tn_T - - # we penalize big model depending on the number of reads/samples - if tn_T == 0: - BIC += log( tn_C ) - elif tn_C == 0: - BIC += log( tn_T ) - else: - BIC += log( tn_T ) + log( tn_C ) - - return ( lnL, BIC ) - - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cpdef tuple CalModel_Heter_AS( np.ndarray[int32_t, ndim=1] top1_bq_T, np.ndarray[int32_t, ndim=1] top1_bq_C, np.ndarray[int32_t, ndim=1] top2_bq_T, np.ndarray[int32_t, ndim=1] top2_bq_C, float max_allowed_ar = 0.99 ): - """Return (lnL, BIC) - - kc - ki - AS_alleleratio - """ - cdef: - int k_T, k_C - double lnL, BIC - int i - int tn_T, tn_C, tn # total observed NTs - double lnL_T, lnL_C # log likelihood for treatment and control - double AS_alleleratio # allele ratio - - lnL = 0 - BIC = 0 - - #assert top2_bq_T.shape[0] + top2_bq_C.shape[0] > 0, "Total number of top2 nt should not be zero while using this function: CalModel_Heter_AS!" 
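The penalties applied in the three CalModel_* functions (identical in the deleted Cython and the new pure-Python versions) follow the Bayesian information criterion, BIC = -2 lnL + p ln(n), charging ln(sample size) per free parameter: the homozygous model fits nothing, the heterozygous model fits one k per sample, and the allele-specific model adds an allele-ratio parameter on the treatment side. In sketch form (illustrative helper, not the module's API):

    from math import log

    def bic(lnL, n_free_params, n_obs):
        # Schwarz criterion: each free parameter costs ln(number of observations)
        return -2 * lnL + n_free_params * log(n_obs)

    # CalModel_Homo:       p = 0             -> BIC = -2 lnL
    # CalModel_Heter_noAS: k_T and k_C       -> ... + log(tn_T) + log(tn_C)
    # CalModel_Heter_AS:   k_T, AR, and k_C  -> ... + 2*log(tn_T) + log(tn_C)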
- - # Treatment - tn_T = top1_bq_T.shape[0] + top2_bq_T.shape[0] - - if tn_T == 0: - raise Exception("Total number of treatment reads is 0!") - else: - ( lnL_T, k_T, AS_alleleratio ) = GreedyMaxFunctionAS( top1_bq_T.shape[0], top2_bq_T.shape[0], tn_T, top1_bq_T, top2_bq_T, max_allowed_ar) - #print ">>>",lnL_T, k_T, AS_alleleratio - lnL += lnL_T - BIC += -2*lnL_T - - # control - tn_C = top1_bq_C.shape[0] + top2_bq_C.shape[0] - - if tn_C == 0: - pass - else: - # We assume control will not have allele preference - ( lnL_C, k_C ) = GreedyMaxFunctionNoAS ( top1_bq_C.shape[0], top2_bq_C.shape[0], tn_C, top1_bq_C, top2_bq_C) - lnL += lnL_C - BIC += -2*lnL_C - - tn = tn_C + tn_T - - # we penalize big model depending on the number of reads/samples - if tn_T == 0: - BIC += log( tn_C ) - elif tn_C == 0: - BIC += 2 * log( tn_T ) - else: - BIC += 2 * log( tn_T ) + log( tn_C ) - - return (lnL, BIC) - - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cdef tuple GreedyMaxFunctionAS( int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne, float max_allowed_ar = 0.99 ): - """Return lnL, k and alleleratio in tuple. - - Note: I only translate Liqing's C++ code into pyx here. Haven't done any review. - """ - cdef: - double dnew, dold, rold, rnew - int kold, knew - bool btemp - int k0 - double dl, dr, d0, d1l, d1r - - assert m+n == tn - btemp = False - if tn == 1: # only 1 read; I don't expect this to be run... - dl=calculate_ln(m,n,tn,me,ne,0,0); - dr=calculate_ln(m,n,tn,me,ne,1,1); - - if dl>dr: - k = 0 - return ( dl, 0, 0 ) - else: - k = 1 - return ( dr, 1, 1 ) - elif m == 0: #no top1 nt - return ( calculate_ln( m, n, tn, me, ne, 0, m, max_allowed_ar ), m, 1-max_allowed_ar ) - #k0 = m + 1 - elif m == tn: #all reads are top1 - return ( calculate_ln( m, n, tn, me, ne, 1, m, max_allowed_ar ), m, max_allowed_ar ) - else: - k0 = m - - d0 = calculate_ln( m, n, tn, me, ne, float(k0)/tn, k0, max_allowed_ar ) - d1l = calculate_ln( m, n, tn, me, ne, float(k0-1)/tn, k0-1, max_allowed_ar ) - d1r = calculate_ln( m, n, tn, me, ne, float(k0+1)/tn, k0+1, max_allowed_ar ) - - if d0 > d1l-1e-8 and d0 > d1r-1e-8: - k = k0 - ar = float(k0)/tn - return ( d0, k, ar ) - elif d1l > d0: - dold = d1l - kold = k0-1 - rold = float(k0-1)/tn - while kold > 1: #disable: when kold=1 still run, than knew=0 is the final run - knew = kold - 1 - rnew = float(knew)/tn - - dnew = calculate_ln( m,n,tn,me,ne,rnew,knew, max_allowed_ar ) - - if(dnew-1e-8 < dold) : - btemp=True - break - kold=knew - dold=dnew - rold=rnew - - if btemp: #maximum L value is in [1,m-1]; - k = kold - ar= rold - return ( dold, k, ar ) - else: #L(k=0) is the max for [0,m-1] - k = kold - ar = rold - return ( dold, k, ar ) - - elif d1r > d0: - dold = d1r - kold = k0 + 1 - rold = float(k0 + 1)/tn - while kold < tn - 1: #//disable: when kold=tn-1 still run, than knew=tn is the final run - knew = kold + 1 - - rnew = float(knew)/tn - - dnew = calculate_ln( m,n,tn,me,ne,rnew,knew, max_allowed_ar ) - - if dnew - 1e-8 < dold: - btemp = True - break - kold = knew - dold = dnew - rold = rnew - - if btemp: #maximum L value is in [m+1,tn-1] - k = kold - ar= rold - return ( dold, k, ar ) - else: #L(k=tn) is the max for [m+1,tn] - k = kold - ar = rold - return ( dold, k, ar ) - else: - raise Exception("error in GreedyMaxFunctionAS") - - -@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off 
-@cython.boundscheck(False) # turn off bounds-checking for entire function -@cython.wraparound(False) # turn off negative index wrapping for entire function -cdef tuple GreedyMaxFunctionNoAS (int m, int n, int tn, np.ndarray[int32_t, ndim=1] me, np.ndarray[int32_t, ndim=1] ne ): - """Return lnL, and k in tuple. - - Note: I only translated Liqing's C++ code into pyx here. It hasn't been reviewed. - """ - cdef: - double dnew, dold - int kold, knew - bool btemp - int k0 - double bg_r, dl, dr, d0, d1l, d1r - - btemp = False - bg_r = 0.5 - - if tn == 1: - dl = calculate_ln( m, n, tn, me, ne, bg_r, 0) - dr= calculate_ln( m, n, tn, me, ne, bg_r, 1) - if dl > dr: - k = 0 - return ( dl, 0 ) - else: - k = 1 - return ( dr, 1 ) - elif m == 0: #no top1 nt - return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m ) - #k0 = m + 1 - elif m == tn: #all reads are top1 - return ( calculate_ln( m, n, tn, me, ne, bg_r, m ), m ) - #elif m == 0: - # k0 = m + 1 - #elif m == tn: - # k0 = m - 1 - else: - k0 = m - - d0 = calculate_ln( m, n, tn, me, ne, bg_r, k0) - d1l = calculate_ln( m, n, tn, me, ne, bg_r, k0 - 1) - d1r = calculate_ln( m, n, tn, me, ne, bg_r, k0 + 1) - - if d0 > d1l - 1e-8 and d0 > d1r - 1e-8: - k = k0 - return ( d0, k ) - elif d1l > d0: - dold = d1l - kold=k0 - 1 - while kold >= 1: #//when kold=1 still run, then knew=0 is the final run - knew = kold - 1 - dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew ) - if dnew - 1e-8 < dold: - btemp = True - break - kold=knew - dold=dnew - - if btemp: #//maximum L value is in [1,m-1]; - k = kold - return ( dold, k ) - else: #//L(k=0) is the max for [0,m-1] - k = kold - return ( dold, k ) - elif d1r > d0: - dold = d1r - kold = k0 + 1 - while kold <= tn - 1: #//when kold=tn-1 still run, then knew=tn is the final run - knew = kold + 1 - dnew = calculate_ln( m, n, tn, me, ne, bg_r, knew ) - if dnew - 1e-8 < dold: - btemp = True - break - kold = knew - dold = dnew - - if btemp: #//maximum L value is in [m+1,tn-1] - k = kold - return ( dold, k ) - else: #//L(k=tn) is the max for [m+1,tn] - k = kold - return ( dold, k ) - else: - raise Exception("error in GreedyMaxFunctionNoAS") -
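Both removed greedy maximizers share one pattern: start from k0 = m, evaluate the two integer neighbours, then walk step by step in whichever direction keeps improving the log likelihood, stopping at the first step that gains less than 1e-8; since the likelihood in k is unimodal, at most one direction can win. A hedged sketch of just that search pattern, where loglik is a toy stand-in rather than calculate_ln:

def greedy_max(loglik, k0, kmin, kmax, eps=1e-8):
    # integer hill-climb: probe both neighbours of k0, keep stepping in an
    # improving direction until the gain drops below eps
    f0 = loglik(k0)
    best_k, best = k0, f0
    for step in (-1, 1):
        k, cur = k0, f0
        while kmin <= k + step <= kmax:
            nxt = loglik(k + step)
            if nxt - eps < cur:      # no real improvement; stop this direction
                break
            k, cur = k + step, nxt
        if cur > best:
            best_k, best = k, cur
    return best, best_k

# toy check: integer maximum of a concave curve peaked near 7.3
lnL, k = greedy_max(lambda k: -(k - 7.3) ** 2, k0=5, kmin=0, kmax=10)
assert k == 7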
- - """ - cdef: - int i - double lnL - double e - - lnL = 0 - - if r > max_allowed_r or r < 1 - max_allowed_r: # r is extremely high or - #print "here1" - lnL += k*log( max_allowed_r ) + (tn-k)*log( 1- max_allowed_r) #-10000 - else: - lnL += k*log( r ) + (tn-k)*log(1-r) - - if k == 0 or k == tn: # it's entirely biased toward 1 allele - #print "here2" - pass - #lnL += k*log( max_allowed_r ) #-10000 - #lnL += -10000 - elif k <= tn/2: - for i in range( k ): - lnL += log(float(tn-i)/(k-i)) - else: - for i in range( tn-k ): - lnL += log(float(tn-i)/(tn-k-i)) - - for i in range( m ): - e = exp( - me[ i ] * LN10_tenth ) - lnL += log((1-e)*(float(k)/tn) + e*(1-float(k)/tn)) - - for i in range( n ): - e = exp( - ne[ i ] * LN10_tenth ) - lnL += log((1-e)*(1-float(k)/tn) + e*(float(k)/tn)) - - #print r,k,lnL - return lnL - -cpdef int calculate_GQ ( double lnL1, double lnL2, double lnL3): - """GQ1 = -10*log_{10}((L2+L3)/(L1+L2+L3)) - - - """ - cdef: - double L1, L2, L3, sum, tmp - int GQ_score - - #L1 = exp(lnL1-lnL1) - L1 = 1 - L2 = exp(lnL2-lnL1) - L3 = exp(lnL3-lnL1) - - #if L1 > 1: - # L1 = 1 - - if L2 > 1: - L2 = 1 - if L3 > 1: - L3 = 1 - #if(L1<1e-110) L1=1e-110; - if L2 < 1e-110: - L2=1e-110 - if L3 < 1e-110: - L3 = 1e-110 - - sum = L1 + L2 + L3 - tmp = ( L2 + L3 )/sum - if tmp > 1e-110: - GQ_score = (int)(-4.34294*log(tmp)) - else: - GQ_score = 255 - - return GQ_score; - -cpdef int calculate_GQ_heterASsig( double lnL1, double lnL2): - """ - """ - cdef: - double L1, L2, sum, tmp - int ASsig_score - - #L1=exp(2.7182818,lnL1-lnL1) - L1 = 1 - L2 = exp( lnL2 - lnL1 ) - - #if L1 > 1: - # L1 = 1 - if L2 > 1: - L2 = 1 - #if L1 < 1e-110: - # L1 = 1e-110 - if L2 < 1e-110: - L2 = 1e-110 - - sum = L1 + L2 - tmp = L2/sum - if tmp > 1e-110: - ASsig_score = (int)(-4.34294*log(tmp)) - else: - ASsig_score = 255 - - return ASsig_score - diff --git a/pyproject.toml b/pyproject.toml index 1f4c6cad..9babe702 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires=['setuptools>=68.0', 'numpy>=1.25,<2.0.0', 'scipy>=1.12', 'cykhash>=2.0,<3.0', 'Cython>=3.0,<3.1', 'scikit-learn>=1.3', 'hmmlearn>=0.3.2'] +requires=['setuptools>=68.0', 'numpy>=1.25,<2', 'scipy>=1.12', 'cykhash>=2.0', 'Cython>=3.0', 'scikit-learn>=1.3', 'hmmlearn>=0.3.2'] build-backend = "setuptools.build_meta" [project] @@ -24,11 +24,11 @@ classifiers =['Development Status :: 5 - Production/Stable', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Cython'] -dependencies = ["numpy>=1.25,<2.0.0", +dependencies = ["numpy>=1.25,<2", "scipy>=1.12", "hmmlearn>=0.3.2", "scikit-learn>=1.3", - "cykhash>=2.0,<3.0"] + "cykhash>=2.0"] [project.urls] Homepage = "https://https://macs3-project.github.io/MACS/" diff --git a/setup.py b/setup.py index a36e558b..96a3f543 100644 --- a/setup.py +++ b/setup.py @@ -105,51 +105,48 @@ def main(): include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.PeakModel", - ["MACS3/Signal/PeakModel.pyx"], + ["MACS3/Signal/PeakModel.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.PeakDetect", - ["MACS3/Signal/PeakDetect.pyx"], + ["MACS3/Signal/PeakDetect.py"], extra_compile_args=extra_c_args), Extension("MACS3.Signal.SignalProcessing", - ["MACS3/Signal/SignalProcessing.pyx"], + ["MACS3/Signal/SignalProcessing.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.FixWidthTrack", - ["MACS3/Signal/FixWidthTrack.pyx"], + 
["MACS3/Signal/FixWidthTrack.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.PairedEndTrack", - ["MACS3/Signal/PairedEndTrack.pyx"], + ["MACS3/Signal/PairedEndTrack.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.BedGraph", - ["MACS3/Signal/BedGraph.pyx"], + ["MACS3/Signal/BedGraph.py"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.ScoreTrack", - ["MACS3/Signal/ScoreTrack.pyx"], + ["MACS3/Signal/ScoreTrack.py"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.CallPeakUnit", - ["MACS3/Signal/CallPeakUnit.pyx"], + ["MACS3/Signal/CallPeakUnit.py"], libraries=["m"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.VariantStat", - ["MACS3/Signal/VariantStat.pyx"], - libraries=["m"], + ["MACS3/Signal/VariantStat.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.ReadAlignment", - ["MACS3/Signal/ReadAlignment.pyx"], - libraries=["m"], - include_dirs=numpy_include_dir, + ["MACS3/Signal/ReadAlignment.py"], extra_compile_args=extra_c_args), Extension("MACS3.Signal.RACollection", - ["MACS3/Signal/RACollection.pyx", + ["MACS3/Signal/RACollection.py", "MACS3/fermi-lite/bfc.c", "MACS3/fermi-lite/bseq.c", "MACS3/fermi-lite/bubble.c", @@ -165,30 +162,25 @@ def main(): "MACS3/fermi-lite/unitig.c", "MACS3/Signal/swalign.c"], libraries=["m", "z"], - include_dirs=numpy_include_dir+["./", - "./MACS3/fermi-lite/", - "./MACS3/Signal/"], + include_dirs=["./", + "./MACS3/fermi-lite/", + "./MACS3/Signal/"], extra_compile_args=extra_c_args+extra_c_args_for_fermi), Extension("MACS3.Signal.UnitigRACollection", - ["MACS3/Signal/UnitigRACollection.pyx"], - libraries=["m"], - include_dirs=numpy_include_dir, + ["MACS3/Signal/UnitigRACollection.py"], extra_compile_args=extra_c_args), Extension("MACS3.Signal.PosReadsInfo", - ["MACS3/Signal/PosReadsInfo.pyx"], - libraries=["m"], + ["MACS3/Signal/PosReadsInfo.py"], include_dirs=numpy_include_dir, extra_compile_args=extra_c_args), Extension("MACS3.Signal.PeakVariants", - ["MACS3/Signal/PeakVariants.pyx"], - libraries=["m"], - include_dirs=numpy_include_dir, + ["MACS3/Signal/PeakVariants.py"], extra_compile_args=extra_c_args), Extension("MACS3.IO.Parser", ["MACS3/IO/Parser.py"], extra_compile_args=extra_c_args), Extension("MACS3.IO.PeakIO", - ["MACS3/IO/PeakIO.pyx"], + ["MACS3/IO/PeakIO.py"], extra_compile_args=extra_c_args), Extension("MACS3.IO.BedGraphIO", ["MACS3/IO/BedGraphIO.py"], diff --git a/test/test.fragments.tsv.gz b/test/test.fragments.tsv.gz new file mode 100644 index 00000000..2b155e16 Binary files /dev/null and b/test/test.fragments.tsv.gz differ diff --git a/test/test_HMMR_poisson.py b/test/test_HMMR_poisson.py index 58efdb76..a71f1f99 100644 --- a/test/test_HMMR_poisson.py +++ b/test/test_HMMR_poisson.py @@ -1,14 +1,10 @@ - import unittest -import pytest # from MACS3.Signal.HMMR_HMM import * import numpy as np import numpy.testing as npt -import numpy as np -import hmmlearn from hmmlearn.hmm import PoissonHMM -from sklearn import cluster -import json +# from sklearn import cluster +# import json # class hmmlearn.hmm.PoissonHMM(n_components=1, startprob_prior=1.0, transmat_prior=1.0, lambdas_prior=0.0, lambdas_weight=0.0,  # algorithm='viterbi', random_state=None, n_iter=10, tol=0.01, verbose=False, params='stl', init_params='stl', implementation='log') @@ -16,21 
+12,28 @@ # means_prior=0, means_weight=0, covars_prior=0.01, covars_weight=1, algorithm='viterbi', random_state=None, n_iter=10, tol=0.01, verbose=False,  # params='stmc', init_params='stmc', implementation='log') -def hmm_training (training_data, training_data_lengths, n_states = 3, random_seed = 12345): + +def hmm_training(training_data, training_data_lengths, n_states=3, random_seed=12345): rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(random_seed))) - hmm_model = PoissonHMM( n_components= n_states, random_state = rs, verbose = False ) - hmm_model = hmm_model.fit( training_data, training_data_lengths ) + hmm_model = PoissonHMM(n_components=n_states, random_state=rs, verbose=False) + hmm_model = hmm_model.fit(training_data, training_data_lengths) assert hmm_model.n_features == 4 return hmm_model -def hmm_predict( signals, lens, hmm_model ): - predictions = hmm_model.predict_proba( signals, lens ) + +def hmm_predict(signals, lens, hmm_model): + predictions = hmm_model.predict_proba(signals, lens) return predictions + class Test_HMM_train_poisson(unittest.TestCase): - def setUp( self ): - self.training_data = np.loadtxt("test/large_training_data.txt", delimiter="\t", dtype="float", usecols=(2,3,4,5)).astype(int).tolist() - self.training_data_lengths = np.loadtxt('test/large_training_lengths.txt', dtype="int").tolist() + def setUp(self): + self.training_data = np.loadtxt("test/large_training_data.txt", + delimiter="\t", + dtype="float", + usecols=(2, 3, 4, 5)).astype(int).tolist() + self.training_data_lengths = np.loadtxt('test/large_training_lengths.txt', + dtype="int").tolist() self.expected_converged = True self.not_expected_transmat = None self.n_features = 4 @@ -38,40 +41,49 @@ def setUp( self ): self.transmat = [[9.87606722e-01, 1.23932782e-02, 1.75299652e-11], [1.76603580e-02, 9.64232293e-01, 1.81073490e-02], [4.87992301e-14, 2.70319349e-02, 9.72968065e-01]] - self.lambdas = [[ 0.03809295, 0.62378578, 0.68739807, 0. ], - [ 0.23243362, 3.4420467, 4.256037, 0. ], - [ 2.58132377, 11.45924282, 8.13706237, 0. 
]] + self.lambdas = [[0.03809295, 0.62378578, 0.68739807, 0.], + [0.23243362, 3.4420467, 4.256037, 0.], + [2.58132377, 11.45924282, 8.13706237, 0.]] # for prediction - self.prediction_data = np.loadtxt("test/small_prediction_data.txt", delimiter="\t", dtype="float", usecols=(2,3,4,5)).astype(int).tolist() - self.prediction_data_lengths = np.loadtxt('test/small_prediction_lengths.txt', dtype="int").tolist() - self.predictions = np.loadtxt('test/small_prediction_results_poisson.txt', delimiter="\t", dtype="float").tolist() + self.prediction_data = np.loadtxt("test/small_prediction_data.txt", + delimiter="\t", + dtype="float", + usecols=(2, 3, 4, 5)).astype(int).tolist() + self.prediction_data_lengths = np.loadtxt('test/small_prediction_lengths.txt', + dtype="int").tolist() + self.predictions = np.loadtxt('test/small_prediction_results_poisson.txt', + delimiter="\t", dtype="float").tolist() - def test_training( self ): + def test_training(self): # test hmm_training: - model = hmm_training(training_data = self.training_data, training_data_lengths = self.training_data_lengths, n_states = 3, random_seed = 12345) - print(model.startprob_) - print(model.transmat_) - print(model.lambdas_) - print(model.n_features) - self.assertEqual( model.monitor_.converged, self.expected_converged ) - self.assertNotEqual( model.transmat_.tolist(), self.not_expected_transmat ) - npt.assert_allclose( model.startprob_.tolist(), self.startprob ) + model = hmm_training(training_data=self.training_data, + training_data_lengths=self.training_data_lengths, + n_states=3, + random_seed=12345) + # print(model.startprob_) + # print(model.transmat_) + # print(model.lambdas_) + # print(model.n_features) + self.assertEqual(model.monitor_.converged, self.expected_converged) + self.assertNotEqual(model.transmat_.tolist(), self.not_expected_transmat) + npt.assert_allclose(model.startprob_.tolist(), self.startprob) npt.assert_allclose(model.transmat_, self.transmat) npt.assert_allclose(model.lambdas_, self.lambdas) npt.assert_allclose(model.n_features, self.n_features) - def test_predict( self ): + def test_predict(self): # test hmm_predict - hmm_model = PoissonHMM( n_components=3 ) + hmm_model = PoissonHMM(n_components=3) hmm_model.startprob_ = np.array(self.startprob) hmm_model.transmat_ = np.array(self.transmat) hmm_model.lambdas_ = np.array(self.lambdas) hmm_model.n_features = self.n_features - predictions = hmm_predict( self.prediction_data, self.prediction_data_lengths, hmm_model ) + predictions = hmm_predict(self.prediction_data, + self.prediction_data_lengths, + hmm_model) - # This is to write the prediction results into a file for 'correct' answer + # This is to write the prediction results into a file for 'correct' answer # with open("test/small_prediction_results_poisson.txt","w") as f: # for x,y,z in predictions: - # f.write( str(x)+"\t"+str(y)+"\t"+str(z)+"\n") - - npt.assert_allclose( predictions, self.predictions ) + npt.assert_allclose(predictions, self.predictions)
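The hmm_training and hmm_predict helpers in this test are thin wrappers over hmmlearn's PoissonHMM; here is a self-contained miniature of the same call pattern on synthetic 4-feature count data (the rates, sequence sizes, and seed are made up, only the API usage mirrors the test):

import numpy as np
from hmmlearn.hmm import PoissonHMM

rng = np.random.default_rng(12345)
# two synthetic sequences of 4-feature counts drawn from different rates
X = np.vstack([rng.poisson([1, 1, 1, 1], size=(50, 4)),
               rng.poisson([5, 8, 6, 1], size=(50, 4))])
lengths = [50, 50]                       # per-sequence lengths, as in the test

model = PoissonHMM(n_components=2, random_state=12345, n_iter=20)
model.fit(X, lengths)                    # EM training
posteriors = model.predict_proba(X, lengths)
print(model.lambdas_.round(2))           # fitted per-state Poisson rates
print(posteriors.shape)                  # (100, 2) state posteriors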
self.input_regions = [(b"chrY",0,100 ), - (b"chrY",70,270 ), - (b"chrY",70,100 ), - (b"chrY",80,160 ), - (b"chrY",80,160 ), - (b"chrY",80,180 ), - (b"chrY",80,180 ), - (b"chrY",85,185 ), - (b"chrY",85,285 ), - (b"chrY",85,285 ), - (b"chrY",85,285 ), - (b"chrY",85,385 ), - (b"chrY",90,190 ), - (b"chrY",90,190 ), - (b"chrY",90,191 ), - (b"chrY",150,190 ), - (b"chrY",150,250 ), + self.input_regions = [(b"chrY", 0, 100), + (b"chrY", 70, 270), + (b"chrY", 70, 100), + (b"chrY", 80, 160), + (b"chrY", 80, 160), + (b"chrY", 80, 180), + (b"chrY", 80, 180), + (b"chrY", 85, 185), + (b"chrY", 85, 285), + (b"chrY", 85, 285), + (b"chrY", 85, 285), + (b"chrY", 85, 385), + (b"chrY", 90, 190), + (b"chrY", 90, 190), + (b"chrY", 90, 191), + (b"chrY", 150, 190), + (b"chrY", 150, 250), ] - self.t = sum([ x[2]-x[1] for x in self.input_regions ]) + self.t = sum([x[2]-x[1] for x in self.input_regions]) def test_add_loc(self): pe = PETrackI() - for ( c, l, r ) in self.input_regions: + for (c, l, r) in self.input_regions: pe.add_loc(c, l, r) pe.finalize() # roughly check the numbers... - self.assertEqual( pe.total, 17 ) - self.assertEqual( pe.length, self.t ) + self.assertEqual(pe.total, 17) + self.assertEqual(pe.length, self.t) def test_filter_dup(self): pe = PETrackI() - for ( c, l, r ) in self.input_regions: + for (c, l, r) in self.input_regions: pe.add_loc(c, l, r) pe.finalize() # roughly check the numbers... - self.assertEqual( pe.total, 17 ) - self.assertEqual( pe.length, self.t ) + self.assertEqual(pe.total, 17) + self.assertEqual(pe.length, self.t) # filter out more than 3 tags - pe.filter_dup( 3 ) - self.assertEqual( pe.total, 17 ) + pe.filter_dup(3) + self.assertEqual(pe.total, 17) # filter out more than 2 tags - pe.filter_dup( 2 ) - self.assertEqual( pe.total, 16 ) + pe.filter_dup(2) + self.assertEqual(pe.total, 16) # filter out more than 1 tag - pe.filter_dup( 1 ) - self.assertEqual( pe.total, 12 ) - + pe.filter_dup(1) + self.assertEqual(pe.total, 12) def test_sample_num(self): pe = PETrackI() - for ( c, l, r ) in self.input_regions: + for (c, l, r) in self.input_regions: pe.add_loc(c, l, r) pe.finalize() - pe.sample_num( 10 ) - self.assertEqual( pe.total, 10 ) + pe.sample_num(10) + self.assertEqual(pe.total, 10) def test_sample_percent(self): pe = PETrackI() - for ( c, l, r ) in self.input_regions: + for (c, l, r) in self.input_regions: pe.add_loc(c, l, r) pe.finalize() - pe.sample_percent( 0.5 ) - self.assertEqual( pe.total, 8 ) + pe.sample_percent(0.5) + self.assertEqual(pe.total, 8) + + +class Test_PETrackII(unittest.TestCase): + def setUp(self): + self.input_regions = [(b"chrY", 0, 100, b"0w#AAACGAAAGACTCGGA", 2), + (b"chrY", 70, 170, b"0w#AAACGAAAGACTCGGA", 1), + (b"chrY", 80, 190, b"0w#AAACGAAAGACTCGGA", 1), + (b"chrY", 85, 180, b"0w#AAACGAAAGACTCGGA", 3), + (b"chrY", 100, 190, b"0w#AAACGAAAGACTCGGA", 1), + (b"chrY", 0, 100, b"0w#AAACGAACAAGTAACA", 1), + (b"chrY", 70, 170, b"0w#AAACGAACAAGTAACA", 2), + (b"chrY", 80, 190, b"0w#AAACGAACAAGTAACA", 1), + (b"chrY", 85, 180, b"0w#AAACGAACAAGTAACA", 1), + (b"chrY", 100, 190, b"0w#AAACGAACAAGTAACA", 3), + (b"chrY", 10, 110, b"0w#AAACGAACAAGTAAGA", 1), + (b"chrY", 50, 160, b"0w#AAACGAACAAGTAAGA", 2), + (b"chrY", 100, 170, b"0w#AAACGAACAAGTAAGA", 3) + ] + self.pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") + self.pileup_v = np.array([3.0, 4.0, 6.0, 9.0, 11.0, 15.0, 19.0, 18.0, 16.0, 10.0, 6.0], dtype="f4") + self.peak_str = "chrom:chrY start:80 end:180 name:peak_1 score:19 summit:105\n" + self.subset_barcodes = 
{b'0w#AAACGAACAAGTAACA', b"0w#AAACGAACAAGTAAGA"} + self.subset_pileup_p = np.array([10, 50, 70, 80, 85, 100, 110, 160, 170, 180, 190], dtype="i4") + self.subset_pileup_v = np.array([1.0, 2.0, 4.0, 6.0, 7.0, 8.0, 13.0, 12.0, 10.0, 5.0, 4.0], dtype="f4") + self.subset_peak_str = "chrom:chrY start:100 end:170 name:peak_1 score:13 summit:105\n" + + self.t = sum([(x[2]-x[1]) * x[4] for x in self.input_regions]) + + def test_add_frag(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + # roughly check the numbers... + self.assertEqual(pe.total, 22) + self.assertEqual(pe.length, self.t) + + # subset + pe_subset = pe.subset(self.subset_barcodes) + # roughly check the numbers... + self.assertEqual(pe_subset.total, 14) + self.assertEqual(pe_subset.length, 1305) + + def test_pileup(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d[0], self.pileup_p) + np.testing.assert_array_equal(d[1], self.pileup_v) + + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d[0], self.subset_pileup_p) + np.testing.assert_array_equal(d[1], self.subset_pileup_v) + + def test_pileup2(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg2() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d['p'], self.pileup_p) + np.testing.assert_array_equal(d['v'], self.pileup_v) + + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg2() + d = bdg.get_data_by_chr(b'chrY') # (p, v) of ndarray + np.testing.assert_array_equal(d['p'], self.subset_pileup_p) + np.testing.assert_array_equal(d['v'], self.subset_pileup_v) + + def test_callpeak(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg() # bedGraphTrackI object + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.peak_str) + + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg() + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.subset_peak_str) + + def test_callpeak2(self): + pe = PETrackII() + for (c, l, r, b, C) in self.input_regions: + pe.add_loc(c, l, r, b, C) + pe.finalize() + bdg = pe.pileup_bdg2() # bedGraphTrackII object + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.peak_str) + pe_subset = pe.subset(self.subset_barcodes) + bdg = pe_subset.pileup_bdg2() + peaks = bdg.call_peaks(cutoff=10, min_length=20, max_gap=10) + self.assertEqual(str(peaks), self.subset_peak_str) diff --git a/test/test_Parser.py b/test/test_Parser.py index 9c42b442..09c82c6d 100644 --- a/test/test_Parser.py +++ b/test/test_Parser.py @@ -1,32 +1,54 @@ #!/usr/bin/env python -# Time-stamp: <2019-12-12 14:42:28 taoliu> +# Time-stamp: <2024-10-16 00:13:01 Tao Liu> import unittest -from MACS3.IO.Parser import * +from MACS3.IO.Parser import (guess_parser, + BEDParser, + SAMParser, + BAMParser, + FragParser) -class Test_auto_guess ( unittest.TestCase ): - def setUp ( self ): +class Test_auto_guess(unittest.TestCase): + + def setUp(self): self.bedfile = "test/tiny.bed.gz" 
self.bedpefile = "test/tiny.bedpe.gz" self.samfile = "test/tiny.sam.gz" self.bamfile = "test/tiny.bam" - def test_guess_parser_bed ( self ): - p = guess_parser( self.bedfile ) - self.assertTrue( p.is_gzipped() ) - self.assertTrue( isinstance(p, BEDParser) ) - - def test_guess_parser_sam ( self ): - p = guess_parser( self.samfile ) - self.assertTrue( p.is_gzipped() ) - self.assertTrue( isinstance(p, SAMParser) ) + def test_guess_parser_bed(self): + p = guess_parser(self.bedfile) + self.assertTrue(p.is_gzipped()) + self.assertTrue(isinstance(p, BEDParser)) - def test_guess_parser_bam ( self ): - p = guess_parser( self.bamfile ) - self.assertTrue( p.is_gzipped() ) - self.assertTrue( isinstance(p, BAMParser) ) + def test_guess_parser_sam(self): + p = guess_parser(self.samfile) + self.assertTrue(p.is_gzipped()) + self.assertTrue(isinstance(p, SAMParser)) + def test_guess_parser_bam(self): + p = guess_parser(self.bamfile) + self.assertTrue(p.is_gzipped()) + self.assertTrue(isinstance(p, BAMParser)) +class Test_parsing(unittest.TestCase): + def setUp(self): + self.bedfile = "test/tiny.bed.gz" + self.bedpefile = "test/tiny.bedpe.gz" + self.samfile = "test/tiny.sam.gz" + self.bamfile = "test/tiny.bam" + self.fragfile = "test/test.fragments.tsv.gz" + + def test_fragment_file(self): + p = FragParser(self.fragfile) + petrack = p.build_petrack2() + petrack.finalize() + bdg = petrack.pileup_bdg() + bdg2 = petrack.pileup_bdg2() + peaks = bdg.call_peaks(cutoff=10, min_length=200, max_gap=100) + peaks2 = bdg2.call_peaks(cutoff=10, min_length=200, max_gap=100) + print(peaks) + print(peaks2) diff --git a/test/test_PeakIO.py b/test/test_PeakIO.py index a7af5a52..0a543f8a 100644 --- a/test/test_PeakIO.py +++ b/test/test_PeakIO.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Time-stamp: <2022-09-14 13:33:37 Tao Liu> +# Time-stamp: <2024-10-14 21:32:21 Tao Liu> import unittest import sys @@ -49,7 +49,7 @@ def test_exclude(self): r1.exclude(r2) result = str(r1) expected = str(self.exclude2from1) - print( "result:\n",result ) - print( "expected:\n", expected ) + # print( "result:\n",result ) + # print( "expected:\n", expected ) self.assertEqual( result, expected ) diff --git a/test/test_Pileup.py b/test/test_Pileup.py index 6abefa48..02b9b198 100644 --- a/test/test_Pileup.py +++ b/test/test_Pileup.py @@ -155,7 +155,7 @@ def test_pileup_1(self): self.param_1["extension"] ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 @@ -217,7 +217,7 @@ def test_max(self): pileup = over_two_pv_array ( self.pv1, self.pv2, func="max" ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 @@ -233,7 +233,7 @@ def test_min(self): pileup = over_two_pv_array ( self.pv1, self.pv2, func="min" ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 @@ -249,7 +249,7 @@ def test_mean(self): pileup = over_two_pv_array ( self.pv1, self.pv2, func="mean" ) result = [] (p,v) = pileup - print(p, v) + # print(p, v) pnext = iter(p).__next__ vnext = iter(v).__next__ pre = 0 diff --git a/test/test_ScoreTrack.py b/test/test_ScoreTrack.py index 61eadc4e..7b7c340e 100644 --- a/test/test_ScoreTrack.py +++ b/test/test_ScoreTrack.py @@ -1,59 +1,62 @@ #!/usr/bin/env python -# Time-stamp: <2020-11-30 14:12:58 Tao Liu> +# Time-stamp: <2024-10-18 15:30:21 Tao Liu> import io import unittest -from numpy.testing import assert_equal, assert_almost_equal, assert_array_equal +from 
numpy.testing import assert_array_equal # assert_equal, assert_almost_equal -from MACS3.Signal.ScoreTrack import * +import numpy as np +from MACS3.Signal.ScoreTrack import ScoreTrackII, TwoConditionScores from MACS3.Signal.BedGraph import bedGraphTrackI + class Test_TwoConditionScores(unittest.TestCase): def setUp(self): self.t1bdg = bedGraphTrackI() self.t2bdg = bedGraphTrackI() self.c1bdg = bedGraphTrackI() self.c2bdg = bedGraphTrackI() - self.test_regions1 = [(b"chrY",0,70,0.00,0.01), - (b"chrY",70,80,7.00,0.5), - (b"chrY",80,150,0.00,0.02)] - self.test_regions2 = [(b"chrY",0,75,20.0,4.00), - (b"chrY",75,90,35.0,6.00), - (b"chrY",90,150,10.0,15.00)] + self.test_regions1 = [(b"chrY", 0, 70, 0.00, 0.01), + (b"chrY", 70, 80, 7.00, 0.5), + (b"chrY", 80, 150, 0.00, 0.02)] + self.test_regions2 = [(b"chrY", 0, 75, 20.0, 4.00), + (b"chrY", 75, 90, 35.0, 6.00), + (b"chrY", 90, 150, 10.0, 15.00)] for a in self.test_regions1: - self.t1bdg.safe_add_loc(a[0],a[1],a[2],a[3]) - self.c1bdg.safe_add_loc(a[0],a[1],a[2],a[4]) + self.t1bdg.safe_add_loc(a[0], a[1], a[2], a[3]) + self.c1bdg.safe_add_loc(a[0], a[1], a[2], a[4]) for a in self.test_regions2: - self.t2bdg.safe_add_loc(a[0],a[1],a[2],a[3]) - self.c2bdg.safe_add_loc(a[0],a[1],a[2],a[4]) - - self.twoconditionscore = TwoConditionScores( self.t1bdg, - self.c1bdg, - self.t2bdg, - self.c2bdg, - 1.0, - 1.0 ) + self.t2bdg.safe_add_loc(a[0], a[1], a[2], a[3]) + self.c2bdg.safe_add_loc(a[0], a[1], a[2], a[4]) + + self.twoconditionscore = TwoConditionScores(self.t1bdg, + self.c1bdg, + self.t2bdg, + self.c2bdg, + 1.0, + 1.0) self.twoconditionscore.build() self.twoconditionscore.finalize() - (self.cat1,self.cat2,self.cat3) = self.twoconditionscore.call_peaks(min_length=10, max_gap=10, cutoff=3) + (self.cat1, self.cat2, self.cat3) = self.twoconditionscore.call_peaks(min_length=10, max_gap=10, cutoff=3) + class Test_ScoreTrackII(unittest.TestCase): def setUp(self): # for initiate scoretrack - self.test_regions1 = [(b"chrY",10,100,10), - (b"chrY",60,10,10), - (b"chrY",110,15,20), - (b"chrY",160,5,20), - (b"chrY",210,20,5)] + self.test_regions1 = [(b"chrY", 10, 100, 10), + (b"chrY", 60, 10, 10), + (b"chrY", 110, 15, 20), + (b"chrY", 160, 5, 20), + (b"chrY", 210, 20, 5)] self.treat_edm = 10 self.ctrl_edm = 5 # for different scoring method - self.p_result = [60.49, 0.38, 0.08, 0.0, 6.41] # -log10(p-value), pseudo count 1 added - self.q_result = [58.17, 0.0, 0.0, 0.0, 5.13] # -log10(q-value) from BH, pseudo count 1 added - self.l_result = [58.17, 0.0, -0.28, -3.25, 4.91] # log10 likelihood ratio, pseudo count 1 added - self.f_result = [0.96, 0.00, -0.12, -0.54, 0.54] # note, pseudo count 1 would be introduced. + self.p_result = [60.49, 0.38, 0.08, 0.0, 6.41] # -log10(p-value), pseudo count 1 added + self.q_result = [58.17, 0.0, 0.0, 0.0, 5.13] # -log10(q-value) from BH, pseudo count 1 added + self.l_result = [58.17, 0.0, -0.28, -3.25, 4.91] # log10 likelihood ratio, pseudo count 1 added + self.f_result = [0.96, 0.00, -0.12, -0.54, 0.54] # note, pseudo count 1 would be introduced. 
self.d_result = [90.00, 0, -5.00, -15.00, 15.00] self.m_result = [10.00, 1.00, 1.50, 0.50, 2.00] # for norm @@ -96,109 +99,108 @@ def setUp(self): chrY 160 210 6.40804 """ # for peak calls - self.peak1 = """chrY 0 60 peak_1 60.4891 -chrY 160 210 peak_2 6.40804 + self.peak1 = """chrY 0 60 MACS_peak_1 60.4891 +chrY 160 210 MACS_peak_2 6.40804 """ - self.summit1 = """chrY 5 6 peak_1 60.4891 -chrY 185 186 peak_2 6.40804 + self.summit1 = """chrY 5 6 MACS_peak_1 60.4891 +chrY 185 186 MACS_peak_2 6.40804 """ self.xls1 ="""chr start end length abs_summit pileup -log10(pvalue) fold_enrichment -log10(qvalue) name chrY 1 60 60 6 100 63.2725 9.18182 -1 MACS_peak_1 chrY 161 210 50 186 20 7.09102 3.5 -1 MACS_peak_2 """ - def assertListAlmostEqual ( self, a, b, places =2 ): - return all( [self.assertAlmostEqual(x, y, places=places) for (x, y) in zip( a, b)] ) + def assertListAlmostEqual(self, a, b, places=2): + return all([self.assertAlmostEqual(x, y, places=places) for (x, y) in zip(a, b)]) def test_compute_scores(self): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.set_pseudocount ( 1.0 ) + s1.set_pseudocount(1.0) - s1.change_score_method( ord('p') ) + s1.change_score_method(ord('p')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in r[3]], self.p_result ) + self.assertListAlmostEqual([round(x, 2) for x in r[3]], self.p_result) - s1.change_score_method( ord('q') ) + s1.change_score_method(ord('q')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.q_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.q_result) - s1.change_score_method( ord('l') ) + s1.change_score_method(ord('l')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.l_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.l_result) - s1.change_score_method( ord('f') ) + s1.change_score_method(ord('f')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.f_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.f_result) - s1.change_score_method( ord('d') ) + s1.change_score_method(ord('d')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.d_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.d_result) - s1.change_score_method( ord('m') ) + s1.change_score_method(ord('m')) r = s1.get_data_by_chr(b"chrY") - self.assertListAlmostEqual( [round(x,2) for x in list(r[3])], self.m_result ) + self.assertListAlmostEqual([round(x, 2) for x in list(r[3])], self.m_result) def test_normalize(self): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.change_normalization_method( ord('T') ) + s1.change_normalization_method(ord('T')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_T ) + assert_array_equal(r, self.norm_T) - s1.change_normalization_method( ord('C') ) + s1.change_normalization_method(ord('C')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_C ) + 
assert_array_equal(r, self.norm_C) - s1.change_normalization_method( ord('M') ) + s1.change_normalization_method(ord('M')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_M ) + assert_array_equal(r, self.norm_M) - s1.change_normalization_method( ord('N') ) + s1.change_normalization_method(ord('N')) r = s1.get_data_by_chr(b"chrY") - assert_array_equal( r, self.norm_N ) + assert_array_equal(r, self.norm_N) - def test_writebedgraph ( self ): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + def test_writebedgraph(self): + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.change_score_method( ord('p') ) + s1.change_score_method(ord('p')) strio = io.StringIO() - s1.write_bedGraph( strio, "NAME", "DESC", 1 ) - self.assertEqual( strio.getvalue(), self.bdg1 ) + s1.write_bedGraph(strio, "NAME", "DESC", 1) + self.assertEqual(strio.getvalue(), self.bdg1) strio = io.StringIO() - s1.write_bedGraph( strio, "NAME", "DESC", 2 ) - self.assertEqual( strio.getvalue(), self.bdg2 ) + s1.write_bedGraph(strio, "NAME", "DESC", 2) + self.assertEqual(strio.getvalue(), self.bdg2) strio = io.StringIO() - s1.write_bedGraph( strio, "NAME", "DESC", 3 ) - self.assertEqual( strio.getvalue(), self.bdg3 ) + s1.write_bedGraph(strio, "NAME", "DESC", 3) + self.assertEqual(strio.getvalue(), self.bdg3) - def test_callpeak ( self ): - s1 = ScoreTrackII( self.treat_edm, self.ctrl_edm ) - s1.add_chromosome( b"chrY", 5 ) + def test_callpeak(self): + s1 = ScoreTrackII(self.treat_edm, self.ctrl_edm) + s1.add_chromosome(b"chrY", 5) for a in self.test_regions1: - s1.add( a[0],a[1],a[2],a[3] ) + s1.add(a[0], a[1], a[2], a[3]) - s1.change_score_method( ord('p') ) - p = s1.call_peaks( cutoff = 0.10, min_length=10, max_gap=10 ) + s1.change_score_method(ord('p')) + p = s1.call_peaks(cutoff=0.10, min_length=10, max_gap=10) strio = io.StringIO() - p.write_to_bed( strio, trackline = False ) - self.assertEqual( strio.getvalue(), self.peak1 ) + p.write_to_bed(strio, trackline=False) + self.assertEqual(strio.getvalue(), self.peak1) strio = io.StringIO() - p.write_to_summit_bed( strio, trackline = False ) - self.assertEqual( strio.getvalue(), self.summit1 ) + p.write_to_summit_bed(strio, trackline=False) + self.assertEqual(strio.getvalue(), self.summit1) strio = io.StringIO() - p.write_to_xls( strio ) - self.assertEqual( strio.getvalue(), self.xls1 ) - + p.write_to_xls(strio) + self.assertEqual(strio.getvalue(), self.xls1)
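For reference, the ScoreTrackII flow that test_callpeak drives, boiled down to a hedged sketch reusing the fixture numbers from setUp above (the one-letter code passed to change_score_method picks the scoring method, e.g. ord('p') for the -log10 Poisson p-value):

import io
from MACS3.Signal.ScoreTrack import ScoreTrackII

s1 = ScoreTrackII(10, 5)            # treatment and control effective depths
s1.add_chromosome(b"chrY", 5)       # chromosome and expected interval count
for (c, e, t, ctl) in [(b"chrY", 10, 100, 10), (b"chrY", 60, 10, 10),
                       (b"chrY", 110, 15, 20), (b"chrY", 160, 5, 20),
                       (b"chrY", 210, 20, 5)]:
    s1.add(c, e, t, ctl)            # interval end, treatment pileup, control pileup
s1.set_pseudocount(1.0)
s1.change_score_method(ord('p'))
peaks = s1.call_peaks(cutoff=0.10, min_length=10, max_gap=10)
out = io.StringIO()
peaks.write_to_bed(out, trackline=False)
print(out.getvalue())               # two peaks, as asserted in test_callpeak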