From 593da0dca93e11eea3ade6980f26dfa2003050a1 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Thu, 30 Jun 2022 23:02:18 -0400 Subject: [PATCH 01/12] Column with annotated mismatched in parse/parse2 --- pairtools/cli/parse.py | 2 ++ pairtools/cli/parse2.py | 1 + pairtools/lib/parse.py | 17 ++++++++++++++++- pairtools/lib/parse_pysam.pyx | 32 +++++++++++++++++++++++++++++++- 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/pairtools/cli/parse.py b/pairtools/cli/parse.py index 567e5e2f..f36929c6 100644 --- a/pairtools/cli/parse.py +++ b/pairtools/cli/parse.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + import click import sys @@ -25,6 +26,7 @@ "dist_to_5", "dist_to_3", "seq", + "mismatches", # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" ] diff --git a/pairtools/cli/parse2.py b/pairtools/cli/parse2.py index 37fb6cdd..37a99504 100644 --- a/pairtools/cli/parse2.py +++ b/pairtools/cli/parse2.py @@ -25,6 +25,7 @@ "dist_to_5", "dist_to_3", "seq", + "mismatches" # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" ] diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index ce0a1a27..76503679 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -35,7 +35,7 @@ """ from . import pairsam_format - +from .parse_pysam import get_mismatches_c def streaming_classify( instream, outstream, chromosomes, out_alignments_stream, out_stat, **kwargs @@ -240,6 +240,7 @@ def empty_alignment(): "clip5_ref": 0, "read_len": 0, "type": "N", + "mismatches": "" } @@ -282,11 +283,23 @@ def parse_pysam_entry( # Note that pysam output is zero-based, thus add +1: pos3 = sam.reference_start + 1 + # Get number of matches: + if not sam.has_tag("MD"): + mismatches = "" + else: + seq = sam.query_sequence.upper() + quals = sam.query_qualities + aligned_pairs = sam.get_aligned_pairs(with_seq=True, matches_only=True) + mismatches = get_mismatches_c(seq, quals, aligned_pairs) + mismatches = ",".join([f"{original}:{mutated}:{phred}:{ref}:{read}" for original, mutated, phred, ref, read in mismatches]) + #n_matches = len(aligned_pairs) + else: chrom = pairsam_format.UNMAPPED_CHROM strand = pairsam_format.UNMAPPED_STRAND pos5 = pairsam_format.UNMAPPED_POS pos3 = pairsam_format.UNMAPPED_POS + mismatches = "" else: chrom = pairsam_format.UNMAPPED_CHROM strand = pairsam_format.UNMAPPED_STRAND @@ -295,6 +308,7 @@ def parse_pysam_entry( dist_to_5 = 0 dist_to_3 = 0 + mismatches = "" algn = { "chrom": chrom, @@ -308,6 +322,7 @@ def parse_pysam_entry( "dist_to_5": dist_to_5, "dist_to_3": dist_to_3, "type": ("N" if not is_mapped else ("M" if not is_unique else "U")), + "mismatches": mismatches } algn.update(cigar) diff --git a/pairtools/lib/parse_pysam.pyx b/pairtools/lib/parse_pysam.pyx index a65d3ba6..efdc6dfe 100644 --- a/pairtools/lib/parse_pysam.pyx +++ b/pairtools/lib/parse_pysam.pyx @@ -91,5 +91,35 @@ cdef class AlignedSegmentPairtoolized(AlignedSegment): "algn_ref_span": algn_ref_span, "algn_read_span": algn_read_span, "read_len": read_len, - "matched_bp": matched_bp, + "matched_bp": matched_bp } + + +from cpython cimport array +import cython +cimport cython + +cpdef list get_mismatches_c(str seq, array.array quals, list aligned_pairs): + ''' + This function takes a SAM alignment and, for every mismatch between the read and reference sequences, + returns a tuple (the reference bp, the read bp, PHRED quality of the bp, reference position, read position). + + Reference: https://github.com/gerlichlab/scshic_pipeline/blob/master/bin/seq_mismatches.pyx + ''' + + cdef cython.int read_pos, ref_pos + cdef str orig_bp, orig_bp_upper + cdef list mismatches = [] + + for read_pos, ref_pos, orig_bp in aligned_pairs: + orig_bp_upper = orig_bp.upper() + if (seq[read_pos] != orig_bp_upper): + mismatches.append( + (orig_bp_upper, + seq[read_pos], + quals[read_pos], + ref_pos, + read_pos) + ) + + return mismatches \ No newline at end of file From 513d561f6e1f3b9478c6eb06c7ea92939da97314 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Fri, 1 Jul 2022 16:20:44 -0400 Subject: [PATCH 02/12] extra columns update in the docs; EXTRA_COLUMNS moved to pairsam_format. --- doc/formats.rst | 35 +++++++++++++++++++++++++++++++++ pairtools/cli/parse.py | 19 ++---------------- pairtools/cli/parse2.py | 20 ++----------------- pairtools/lib/pairsam_format.py | 15 ++++++++++++++ 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/doc/formats.rst b/doc/formats.rst index 8ca364d4..2c1a1c1e 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -148,3 +148,38 @@ Finally, sam1 and sam2 can store multiple .sam alignments, separated by a string .. |check| unicode:: U+2714 .. check .. |cross| unicode:: U+274C .. cross +Extra columns +---------------- + +`pairtools` can operate on `.pairs/.pairsam` with extra columns. +Extra columns are specified in the order defined by the order their addition by various tools. +Column names can be checked in the header of `.pairs/.pairsam` file. +We provide `pairtools header` utilities for manipulating and verifying compatibility of headers and their columns. + +The list of additional columns used thoughout `pairtools` modules: + +============== ============================ ================ ============== ======================================================= ============= +extra column module that adds column format how to add it description examples +============== ============================ ================ ============== ======================================================= ============= +mapq `pairtools parse/parse2` +pos5 `pairtools parse/parse2` +pos3 `pairtools parse/parse2` +cigar `pairtools parse/parse2` +read_len `pairtools parse/parse2` +matched_bp `pairtools parse/parse2` +algn_ref_span `pairtools parse/parse2` +algn_read_span `pairtools parse/parse2` +dist_to_5 `pairtools parse/parse2` +dist_to_3 `pairtools parse/parse2` +seq `pairtools parse/parse2` +mismatches `pairtools parse/parse2` +XB,AS,XS or any sam flag `pairtools parse/parse2` +walk_pair_type `pairtools parse/parse2` +walk_pair_index `pairtools parse/parse2` +phase `pairtools phase` +rfrag1 `pairtools restrict` +rfrag_start1 `pairtools restrict` +rfrag_end1 `pairtools restrict` +rfrag2 `pairtools restrict` +rfrag_start2 `pairtools restrict` +rfrag_end2 `pairtools restrict` integer number run restrict in .pairs file coordinate of the end of restriction fragment \ No newline at end of file diff --git a/pairtools/cli/parse.py b/pairtools/cli/parse.py index f36929c6..fc34cc01 100644 --- a/pairtools/cli/parse.py +++ b/pairtools/cli/parse.py @@ -14,21 +14,6 @@ UTIL_NAME = "pairtools_parse" -EXTRA_COLUMNS = [ - "mapq", - "pos5", - "pos3", - "cigar", - "read_len", - "matched_bp", - "algn_ref_span", - "algn_read_span", - "dist_to_5", - "dist_to_3", - "seq", - "mismatches", # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" -] - @cli.command() @click.argument("sam_path", type=str, required=False) @@ -98,7 +83,7 @@ help="Report extra columns describing alignments " "Possible values (can take multiple values as a comma-separated " "list): a SAM tag (any pair of uppercase letters) or {}.".format( - ", ".join(EXTRA_COLUMNS) + ", ".join(pairsam_format.EXTRA_COLUMNS) ), ) @click.option( @@ -230,7 +215,7 @@ def parse_py( add_columns = kwargs.get("add_columns", []) add_columns = [col for col in add_columns.split(",") if col] for col in add_columns: - if not ((col in EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): + if not ((col in pairsam_format.EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): raise Exception("{} is not a valid extra column".format(col)) columns = pairsam_format.COLUMNS + ( diff --git a/pairtools/cli/parse2.py b/pairtools/cli/parse2.py index 37a99504..1743d123 100644 --- a/pairtools/cli/parse2.py +++ b/pairtools/cli/parse2.py @@ -13,22 +13,6 @@ UTIL_NAME = "pairtools_parse2" -EXTRA_COLUMNS = [ - "mapq", - "pos5", - "pos3", - "cigar", - "read_len", - "matched_bp", - "algn_ref_span", - "algn_read_span", - "dist_to_5", - "dist_to_3", - "seq", - "mismatches" # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" -] - - @cli.command() @click.argument("sam_path", type=str, required=False) # Parsing options: @@ -193,7 +177,7 @@ help="Report extra columns describing alignments " "Possible values (can take multiple values as a comma-separated " "list): a SAM tag (any pair of uppercase letters) or {}.".format( - ", ".join(EXTRA_COLUMNS) + ", ".join(pairsam_format.EXTRA_COLUMNS) ), ) @click.option( @@ -282,7 +266,7 @@ def parse2_py( add_columns = kwargs.get("add_columns", []) add_columns = [col for col in add_columns.split(",") if col] for col in add_columns: - if not ((col in EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): + if not ((col in pairsam_format.EXTRA_COLUMNS) or (len(col) == 2 and col.isupper())): raise Exception("{} is not a valid extra column".format(col)) columns = pairsam_format.COLUMNS + ( diff --git a/pairtools/lib/pairsam_format.py b/pairtools/lib/pairsam_format.py index dfbd95ec..96c13f01 100644 --- a/pairtools/lib/pairsam_format.py +++ b/pairtools/lib/pairsam_format.py @@ -62,3 +62,18 @@ UNMAPPED_STRAND = "-" UNANNOTATED_RFRAG = -1 + +EXTRA_COLUMNS = [ + "mapq", + "pos5", + "pos3", + "cigar", + "read_len", + "matched_bp", + "algn_ref_span", + "algn_read_span", + "dist_to_5", + "dist_to_3", + "seq", + "mismatches" # Format: "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" +] \ No newline at end of file From ec15514844fdce6240a65aef3cb242ded1f7f67a Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Fri, 1 Jul 2022 16:22:22 -0400 Subject: [PATCH 03/12] draft docs on extra columns update --- doc/formats.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/formats.rst b/doc/formats.rst index 2c1a1c1e..222a6626 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -158,6 +158,7 @@ We provide `pairtools header` utilities for manipulating and verifying compatibi The list of additional columns used thoughout `pairtools` modules: + ============== ============================ ================ ============== ======================================================= ============= extra column module that adds column format how to add it description examples ============== ============================ ================ ============== ======================================================= ============= @@ -182,4 +183,5 @@ rfrag_start1 `pairtools restrict` rfrag_end1 `pairtools restrict` rfrag2 `pairtools restrict` rfrag_start2 `pairtools restrict` -rfrag_end2 `pairtools restrict` integer number run restrict in .pairs file coordinate of the end of restriction fragment \ No newline at end of file +rfrag_end2 `pairtools restrict` integer number run restrict in .pairs file coordinate of the end of restriction fragment +============== ============================ ================ ============== ======================================================= ============= From ddf6c327f4614283dcd2ff3bbcd66e8d83035ef8 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Fri, 1 Jul 2022 16:25:35 -0400 Subject: [PATCH 04/12] draft docs on extra columns update --- doc/formats.rst | 51 +++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/doc/formats.rst b/doc/formats.rst index 222a6626..ed5bfe23 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -159,29 +159,30 @@ We provide `pairtools header` utilities for manipulating and verifying compatibi The list of additional columns used thoughout `pairtools` modules: -============== ============================ ================ ============== ======================================================= ============= -extra column module that adds column format how to add it description examples -============== ============================ ================ ============== ======================================================= ============= -mapq `pairtools parse/parse2` -pos5 `pairtools parse/parse2` -pos3 `pairtools parse/parse2` -cigar `pairtools parse/parse2` -read_len `pairtools parse/parse2` -matched_bp `pairtools parse/parse2` -algn_ref_span `pairtools parse/parse2` -algn_read_span `pairtools parse/parse2` -dist_to_5 `pairtools parse/parse2` -dist_to_3 `pairtools parse/parse2` -seq `pairtools parse/parse2` -mismatches `pairtools parse/parse2` +========================= ============================ ================ ===================== ======================================================= ============= +extra column module that adds column format how to add it description examples +========================= ============================ ================ ===================== ======================================================= ============= +rfrag_end2 `pairtools restrict` integer number `restrict` on .pairs coordinate of the end of restriction fragment . +========================= ============================ ================ ===================== ======================================================= ============= + +mapq `pairtools parse/parse2` . +pos5 `pairtools parse/parse2` . +pos3 `pairtools parse/parse2` . +cigar `pairtools parse/parse2` +read_len `pairtools parse/parse2` +matched_bp `pairtools parse/parse2` +algn_ref_span `pairtools parse/parse2` +algn_read_span `pairtools parse/parse2` +dist_to_5 `pairtools parse/parse2` +dist_to_3 `pairtools parse/parse2` +seq `pairtools parse/parse2` +mismatches `pairtools parse/parse2` XB,AS,XS or any sam flag `pairtools parse/parse2` -walk_pair_type `pairtools parse/parse2` -walk_pair_index `pairtools parse/parse2` -phase `pairtools phase` -rfrag1 `pairtools restrict` -rfrag_start1 `pairtools restrict` -rfrag_end1 `pairtools restrict` -rfrag2 `pairtools restrict` -rfrag_start2 `pairtools restrict` -rfrag_end2 `pairtools restrict` integer number run restrict in .pairs file coordinate of the end of restriction fragment -============== ============================ ================ ============== ======================================================= ============= +walk_pair_type `pairtools parse/parse2` +walk_pair_index `pairtools parse/parse2` +phase `pairtools phase` +rfrag1 `pairtools restrict` +rfrag_start1 `pairtools restrict` +rfrag_end1 `pairtools restrict` +rfrag2 `pairtools restrict` +rfrag_start2 `pairtools restrict` From 57a118ed82f4b77a5713baf3de4b8c9e780fb7a3 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Sat, 2 Jul 2022 14:51:04 -0400 Subject: [PATCH 05/12] docs update, python3.10 version update of channels --- .github/workflows/python-package.yml | 2 +- doc/formats.rst | 50 +++++++++++++--------------- doc/parsing.rst | 2 +- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1f23b782..2cf8503d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -48,7 +48,7 @@ jobs: conda info -a # Create test environment and install deps - conda create -q -n test-environment python=${{ matrix.python-version }} setuptools pip cython numpy pandas nose samtools pysam scipy pyyaml bioframe + conda create -q -n test-environment -c conda-forge -c bioconda python=${{ matrix.python-version }} setuptools pip cython numpy pandas nose samtools pysam scipy pyyaml bioframe source activate test-environment pip install click python setup.py build_ext -i diff --git a/doc/formats.rst b/doc/formats.rst index ed5bfe23..caea3a08 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -159,30 +159,26 @@ We provide `pairtools header` utilities for manipulating and verifying compatibi The list of additional columns used thoughout `pairtools` modules: -========================= ============================ ================ ===================== ======================================================= ============= -extra column module that adds column format how to add it description examples -========================= ============================ ================ ===================== ======================================================= ============= -rfrag_end2 `pairtools restrict` integer number `restrict` on .pairs coordinate of the end of restriction fragment . -========================= ============================ ================ ===================== ======================================================= ============= - -mapq `pairtools parse/parse2` . -pos5 `pairtools parse/parse2` . -pos3 `pairtools parse/parse2` . -cigar `pairtools parse/parse2` -read_len `pairtools parse/parse2` -matched_bp `pairtools parse/parse2` -algn_ref_span `pairtools parse/parse2` -algn_read_span `pairtools parse/parse2` -dist_to_5 `pairtools parse/parse2` -dist_to_3 `pairtools parse/parse2` -seq `pairtools parse/parse2` -mismatches `pairtools parse/parse2` -XB,AS,XS or any sam flag `pairtools parse/parse2` -walk_pair_type `pairtools parse/parse2` -walk_pair_index `pairtools parse/parse2` -phase `pairtools phase` -rfrag1 `pairtools restrict` -rfrag_start1 `pairtools restrict` -rfrag_end1 `pairtools restrict` -rfrag2 `pairtools restrict` -rfrag_start2 `pairtools restrict` +===== | ===== | ===== | ===== | ===== | ===== +extra column | generating module | format | how to add it | description | examples +===== | ===== | ===== | ===== | ===== | ===== +mapq1, mapq2 | `parse/parse2` | number from 0 to 255 | `pairtools parse --add-columns mapq` | [Mapping quality](https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/) of alignment, as reported in .sam/.bam, usually $-10*log_{10}(P(mapping position is wrong)}$ | +pos51, pos52 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos5` | 5' position of alignment (closer to read start) | +pos31, pos32 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos3` | 3' position of alignment (further from read start) | +cigar1, cigar2 | `parse/parse2` | string | `pairtools parse --add-columns cigar` | [CIGAR, or Compact Idiosyncratic Gapped Alignment Report](https://en.wikipedia.org/wiki/Sequence_alignment#CIGAR_Format) of alignment, as reported in .sam/.bam | +read_len1, read_len2 | `parse/parse2` | number | `pairtools parse --add-columns read_len` | read length | +matched_bp1, matched_bp2 | `parse/parse2` | number | `pairtools parse --add-columns matched_bp` | number of matched alignment basepairs to the reference | +algn_ref_span1, algn_ref_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_ref_span` | basepairs of reference covered by alignment | +algn_read_span1, algn_read_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_read_span` | basepairs of read covered by alignment | +dist_to_51, dist_to_52 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_5` | distance to 5'-end of read | +dist_to_31, dist_to_32 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_3` | distance to 3'-end of read | +seq1, seq2 | `parse/parse2` | string | `pairtools parse --add-columns seq` | sequence of alignment | +mismatches1, mismatches2 | `parse/parse2` | string | `pairtools parse --add-columns mismatches` | comma-separated list of mismatches relative to the reference, "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" | +XB1/2,AS1/2,XS1/2 or any sam tag | `parse/parse2` | format depends on [tag specification](https://samtools.github.io/hts-specs/SAMv1.pdf) | `pairtools parse --add-columns XA,XB,NM` | | +walk_pair_type | `parse/parse2` | string | `pairtools parse2 --add-pair-index` | Type of the pair relative to R1 and R2 reads of paired-end sequencing, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | +walk_pair_index | `parse/parse2` | number | `pairtools parse2 --add-pair-index` | Order of the pair in the complex walk, starting from 5'-end of left read, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | +phase | `phase` | 0, 1 or "." | `pairtools phase` | Phase of alignment (haplotype 1, 2, on unphased), see [phasing walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_phase_walkthrough.html) | +rfrag1, rfrag2 | `restrict` | number | `pairtools restrict` | Unique index of the restriction fragment after annotating pairs positions, see [restriction walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_restrict_walkthrough.html) | +rfrag_start1, rfrag_start2 | `restrict` | number | `pairtools restrict` | Coordinate of the start of restriction fragment +rfrag_end1, rfrag_end2 | `restrict` | number | `pairtools restrict` | Coordinate of the end of restriction fragment +===== | ===== | ===== | ===== | ===== | ===== \ No newline at end of file diff --git a/doc/parsing.rst b/doc/parsing.rst index b38f04a9..d7d63935 100644 --- a/doc/parsing.rst +++ b/doc/parsing.rst @@ -265,7 +265,7 @@ position: Sometimes it is important to restore the sequence of ligation events (e.g., for MC-3C data). For that, you can add special columns ``walk_pair_index`` and ``walk_pair_type`` by setting ``--add-pair-index`` option of ``parse2``, that will keep the order and type of pair in the whole walk in the output .pairs file. -- ``walk_pair_index`` contains information on the order of the pair in the recovered walk, starting from 5'-end of left read +- ``walk_pair_index`` contains information on the order of the pair in the complex walk, starting from 5'-end of left read - ``walk_pair_type`` describes the type of the pair relative to R1 and R2 reads of paired-end sequencing: - "R1-2" - unconfirmed pair, right and left alignments in the pair originate from different reads (left or right). This might be indirect ligation (mediated by other DNA fragments). From 583d6ef9f49fda2285e80cc95c31658abed8dd0e Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Sat, 2 Jul 2022 15:09:49 -0400 Subject: [PATCH 06/12] docs update, python3.10 version update of channels --- .github/workflows/python-package.yml | 4 +-- doc/formats.rst | 51 ++++++++++++++-------------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 2cf8503d..ae37fe0d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -39,7 +39,6 @@ jobs: hash -r conda config --set always_yes yes --set changeps1 no conda config --add channels conda-forge - conda config --add channels defaults conda config --add channels bioconda conda config --get @@ -48,8 +47,9 @@ jobs: conda info -a # Create test environment and install deps - conda create -q -n test-environment -c conda-forge -c bioconda python=${{ matrix.python-version }} setuptools pip cython numpy pandas nose samtools pysam scipy pyyaml bioframe + conda create -q -n test-environment -c anaconda python=${{ matrix.python-version }} setuptools pip numpy pandas nose source activate test-environment + conda install -c conda-forge -c bioconda cython samtools pysam scipy pyyaml bioframe pip install click python setup.py build_ext -i diff --git a/doc/formats.rst b/doc/formats.rst index caea3a08..1ec88f39 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -156,29 +156,28 @@ Extra columns are specified in the order defined by the order their addition by Column names can be checked in the header of `.pairs/.pairsam` file. We provide `pairtools header` utilities for manipulating and verifying compatibility of headers and their columns. -The list of additional columns used thoughout `pairtools` modules: - - -===== | ===== | ===== | ===== | ===== | ===== -extra column | generating module | format | how to add it | description | examples -===== | ===== | ===== | ===== | ===== | ===== -mapq1, mapq2 | `parse/parse2` | number from 0 to 255 | `pairtools parse --add-columns mapq` | [Mapping quality](https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/) of alignment, as reported in .sam/.bam, usually $-10*log_{10}(P(mapping position is wrong)}$ | -pos51, pos52 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos5` | 5' position of alignment (closer to read start) | -pos31, pos32 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos3` | 3' position of alignment (further from read start) | -cigar1, cigar2 | `parse/parse2` | string | `pairtools parse --add-columns cigar` | [CIGAR, or Compact Idiosyncratic Gapped Alignment Report](https://en.wikipedia.org/wiki/Sequence_alignment#CIGAR_Format) of alignment, as reported in .sam/.bam | -read_len1, read_len2 | `parse/parse2` | number | `pairtools parse --add-columns read_len` | read length | -matched_bp1, matched_bp2 | `parse/parse2` | number | `pairtools parse --add-columns matched_bp` | number of matched alignment basepairs to the reference | -algn_ref_span1, algn_ref_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_ref_span` | basepairs of reference covered by alignment | -algn_read_span1, algn_read_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_read_span` | basepairs of read covered by alignment | -dist_to_51, dist_to_52 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_5` | distance to 5'-end of read | -dist_to_31, dist_to_32 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_3` | distance to 3'-end of read | -seq1, seq2 | `parse/parse2` | string | `pairtools parse --add-columns seq` | sequence of alignment | -mismatches1, mismatches2 | `parse/parse2` | string | `pairtools parse --add-columns mismatches` | comma-separated list of mismatches relative to the reference, "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" | -XB1/2,AS1/2,XS1/2 or any sam tag | `parse/parse2` | format depends on [tag specification](https://samtools.github.io/hts-specs/SAMv1.pdf) | `pairtools parse --add-columns XA,XB,NM` | | -walk_pair_type | `parse/parse2` | string | `pairtools parse2 --add-pair-index` | Type of the pair relative to R1 and R2 reads of paired-end sequencing, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | -walk_pair_index | `parse/parse2` | number | `pairtools parse2 --add-pair-index` | Order of the pair in the complex walk, starting from 5'-end of left read, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | -phase | `phase` | 0, 1 or "." | `pairtools phase` | Phase of alignment (haplotype 1, 2, on unphased), see [phasing walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_phase_walkthrough.html) | -rfrag1, rfrag2 | `restrict` | number | `pairtools restrict` | Unique index of the restriction fragment after annotating pairs positions, see [restriction walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_restrict_walkthrough.html) | -rfrag_start1, rfrag_start2 | `restrict` | number | `pairtools restrict` | Coordinate of the start of restriction fragment -rfrag_end1, rfrag_end2 | `restrict` | number | `pairtools restrict` | Coordinate of the end of restriction fragment -===== | ===== | ===== | ===== | ===== | ===== \ No newline at end of file +The list of additional columns used throughout `pairtools` modules: + ++=====+=====+=====+=====+=====+=====+ +| extra column | generating module | format | how to add it | description | ++=====+=====+=====+=====+=====+=====+ +| mapq1, mapq2 | `parse/parse2` | number from 0 to 255 | `pairtools parse --add-columns mapq` | [Mapping quality](https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/) of alignment, as reported in .sam/.bam, usually $-10*log_{10}(P(mapping position is wrong)}$ | +| pos51, pos52 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos5` | 5' position of alignment (closer to read start) | +| pos31, pos32 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos3` | 3' position of alignment (further from read start) | +| cigar1, cigar2 | `parse/parse2` | string | `pairtools parse --add-columns cigar` | [CIGAR, or Compact Idiosyncratic Gapped Alignment Report](https://en.wikipedia.org/wiki/Sequence_alignment#CIGAR_Format) of alignment, as reported in .sam/.bam | +| read_len1, read_len2 | `parse/parse2` | number | `pairtools parse --add-columns read_len` | read length | +| matched_bp1, matched_bp2 | `parse/parse2` | number | `pairtools parse --add-columns matched_bp` | number of matched alignment basepairs to the reference | +| algn_ref_span1, algn_ref_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_ref_span` | basepairs of reference covered by alignment | +| algn_read_span1, algn_read_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_read_span` | basepairs of read covered by alignment | +| dist_to_51, dist_to_52 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_5` | distance to 5'-end of read | +| dist_to_31, dist_to_32 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_3` | distance to 3'-end of read | +| seq1, seq2 | `parse/parse2` | string | `pairtools parse --add-columns seq` | sequence of alignment | +| mismatches1, mismatches2 | `parse/parse2` | string | `pairtools parse --add-columns mismatches` | comma-separated list of mismatches relative to the reference, "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" | +| XB1/2,AS1/2,XS1/2 or any sam tag | `parse/parse2` | format depends on [tag specification](https://samtools.github.io/hts-specs/SAMv1.pdf) | `pairtools parse --add-columns XA,XB,NM` | +| walk_pair_type | `parse/parse2` | string | `pairtools parse2 --add-pair-index` | Type of the pair relative to R1 and R2 reads of paired-end sequencing, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | +| walk_pair_index | `parse/parse2` | number | `pairtools parse2 --add-pair-index` | Order of the pair in the complex walk, starting from 5'-end of left read, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | +| phase | `phase` | 0, 1 or "." | `pairtools phase` | Phase of alignment (haplotype 1, 2, on unphased), see [phasing walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_phase_walkthrough.html) | +| rfrag1, rfrag2 | `restrict` | number | `pairtools restrict` | Unique index of the restriction fragment after annotating pairs positions, see [restriction walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_restrict_walkthrough.html) | +| rfrag_start1, rfrag_start2 | `restrict` | number | `pairtools restrict` | Coordinate of the start of restriction fragment | +| rfrag_end1, rfrag_end2 | `restrict` | number | `pairtools restrict` | Coordinate of the end of restriction fragment | ++=====+=====+=====+=====+=====+=====+ From 54958c685e1e3a964548d692d46bcae248fdf94c Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Sun, 3 Jul 2022 00:06:22 -0400 Subject: [PATCH 07/12] docs update, python3.10 version update of channels --- .github/workflows/python-package.yml | 5 +++-- doc/formats.rst | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ae37fe0d..ff6aeee0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -47,9 +47,10 @@ jobs: conda info -a # Create test environment and install deps - conda create -q -n test-environment -c anaconda python=${{ matrix.python-version }} setuptools pip numpy pandas nose + conda create -q -n test-environment -c anaconda python=${{ matrix.python-version }} setuptools pip numpy scipy pandas nose source activate test-environment - conda install -c conda-forge -c bioconda cython samtools pysam scipy pyyaml bioframe + conda install -c conda-forge cython pyyaml + conda install -c conda-forge -c bioconda samtools pysam bioframe pip install click python setup.py build_ext -i diff --git a/doc/formats.rst b/doc/formats.rst index 1ec88f39..964a7194 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -161,7 +161,7 @@ The list of additional columns used throughout `pairtools` modules: +=====+=====+=====+=====+=====+=====+ | extra column | generating module | format | how to add it | description | +=====+=====+=====+=====+=====+=====+ -| mapq1, mapq2 | `parse/parse2` | number from 0 to 255 | `pairtools parse --add-columns mapq` | [Mapping quality](https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/) of alignment, as reported in .sam/.bam, usually $-10*log_{10}(P(mapping position is wrong)}$ | +| mapq1, mapq2 | `parse/parse2` | number from 0 to 255 | `pairtools parse --add-columns mapq` | [Mapping quality](https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/) of alignment, as reported in .sam/.bam, usually $-10*log_{10}(P(mapping position is wrong))$ | | pos51, pos52 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos5` | 5' position of alignment (closer to read start) | | pos31, pos32 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos3` | 3' position of alignment (further from read start) | | cigar1, cigar2 | `parse/parse2` | string | `pairtools parse --add-columns cigar` | [CIGAR, or Compact Idiosyncratic Gapped Alignment Report](https://en.wikipedia.org/wiki/Sequence_alignment#CIGAR_Format) of alignment, as reported in .sam/.bam | From 532dde107aac48982fb9e802853013d737c2cfc3 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Sun, 3 Jul 2022 15:04:07 -0400 Subject: [PATCH 08/12] docs update, python3.10 version update of channels --- .github/workflows/python-package.yml | 3 +- doc/formats.rst | 47 ++++++++++++++-------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ff6aeee0..9d28e777 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -50,7 +50,8 @@ jobs: conda create -q -n test-environment -c anaconda python=${{ matrix.python-version }} setuptools pip numpy scipy pandas nose source activate test-environment conda install -c conda-forge cython pyyaml - conda install -c conda-forge -c bioconda samtools pysam bioframe + conda install -c bioconda samtools + pip install pysam bioframe # workaround: incompatible with python3.10 if installed by conda pip install click python setup.py build_ext -i diff --git a/doc/formats.rst b/doc/formats.rst index 964a7194..a778c994 100644 --- a/doc/formats.rst +++ b/doc/formats.rst @@ -158,26 +158,27 @@ We provide `pairtools header` utilities for manipulating and verifying compatibi The list of additional columns used throughout `pairtools` modules: -+=====+=====+=====+=====+=====+=====+ -| extra column | generating module | format | how to add it | description | -+=====+=====+=====+=====+=====+=====+ -| mapq1, mapq2 | `parse/parse2` | number from 0 to 255 | `pairtools parse --add-columns mapq` | [Mapping quality](https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/) of alignment, as reported in .sam/.bam, usually $-10*log_{10}(P(mapping position is wrong))$ | -| pos51, pos52 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos5` | 5' position of alignment (closer to read start) | -| pos31, pos32 | `parse/parse2` | genomic coordinate | `pairtools parse --add-columns pos3` | 3' position of alignment (further from read start) | -| cigar1, cigar2 | `parse/parse2` | string | `pairtools parse --add-columns cigar` | [CIGAR, or Compact Idiosyncratic Gapped Alignment Report](https://en.wikipedia.org/wiki/Sequence_alignment#CIGAR_Format) of alignment, as reported in .sam/.bam | -| read_len1, read_len2 | `parse/parse2` | number | `pairtools parse --add-columns read_len` | read length | -| matched_bp1, matched_bp2 | `parse/parse2` | number | `pairtools parse --add-columns matched_bp` | number of matched alignment basepairs to the reference | -| algn_ref_span1, algn_ref_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_ref_span` | basepairs of reference covered by alignment | -| algn_read_span1, algn_read_span2 | `parse/parse2` | number | `pairtools parse --add-columns algn_read_span` | basepairs of read covered by alignment | -| dist_to_51, dist_to_52 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_5` | distance to 5'-end of read | -| dist_to_31, dist_to_32 | `parse/parse2` | number | `pairtools parse --add-columns dist_to_3` | distance to 3'-end of read | -| seq1, seq2 | `parse/parse2` | string | `pairtools parse --add-columns seq` | sequence of alignment | -| mismatches1, mismatches2 | `parse/parse2` | string | `pairtools parse --add-columns mismatches` | comma-separated list of mismatches relative to the reference, "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" | -| XB1/2,AS1/2,XS1/2 or any sam tag | `parse/parse2` | format depends on [tag specification](https://samtools.github.io/hts-specs/SAMv1.pdf) | `pairtools parse --add-columns XA,XB,NM` | -| walk_pair_type | `parse/parse2` | string | `pairtools parse2 --add-pair-index` | Type of the pair relative to R1 and R2 reads of paired-end sequencing, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | -| walk_pair_index | `parse/parse2` | number | `pairtools parse2 --add-pair-index` | Order of the pair in the complex walk, starting from 5'-end of left read, see [pasring docs](https://pairtools.readthedocs.io/en/latest/parsing.html#rescuing-complex-walks) | -| phase | `phase` | 0, 1 or "." | `pairtools phase` | Phase of alignment (haplotype 1, 2, on unphased), see [phasing walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_phase_walkthrough.html) | -| rfrag1, rfrag2 | `restrict` | number | `pairtools restrict` | Unique index of the restriction fragment after annotating pairs positions, see [restriction walkthrough](https://pairtools.readthedocs.io/en/latest/examples/pairtools_restrict_walkthrough.html) | -| rfrag_start1, rfrag_start2 | `restrict` | number | `pairtools restrict` | Coordinate of the start of restriction fragment | -| rfrag_end1, rfrag_end2 | `restrict` | number | `pairtools restrict` | Coordinate of the end of restriction fragment | -+=====+=====+=====+=====+=====+=====+ +=================================== =================== ====================== ================================================== ================= +extra column generating module format how to add it description +=================================== =================== ====================== ================================================== ================= +mapq1, mapq2 `parse/parse2` number from 0 to 255 `pairtools parse --add-columns mapq` `Mapping quality `_, as reported in .sam/.bam, $-10 log_{10}(P_{error})$ +pos51, pos52 `parse/parse2` genomic coordinate `pairtools parse --add-columns pos5` 5' position of alignment (closer to read start) +pos31, pos32 `parse/parse2` genomic coordinate `pairtools parse --add-columns pos3` 3' position of alignment (further from read start) +cigar1, cigar2 `parse/parse2` string `pairtools parse --add-columns cigar` `CIGAR, or Compact Idiosyncratic Gapped Alignment Report `_ of alignment, as reported in .sam/.bam +read_len1, read_len2 `parse/parse2` number `pairtools parse --add-columns read_len` read length +matched_bp1, matched_bp2 `parse/parse2` number `pairtools parse --add-columns matched_bp` number of matched alignment basepairs to the reference +algn_ref_span1, algn_ref_span2 `parse/parse2` number `pairtools parse --add-columns algn_ref_span` basepairs of reference covered by alignment +algn_read_span1, algn_read_span2 `parse/parse2` number `pairtools parse --add-columns algn_read_span` basepairs of read covered by alignment +dist_to_51, dist_to_52 `parse/parse2` number `pairtools parse --add-columns dist_to_5` distance to 5'-end of read +dist_to_31, dist_to_32 `parse/parse2` number `pairtools parse --add-columns dist_to_3` distance to 3'-end of read +seq1, seq2 `parse/parse2` string `pairtools parse --add-columns seq` sequence of alignment +mismatches1, mismatches2 `parse/parse2` string `pairtools parse --add-columns mismatches` comma-separated list of mismatches relative to the reference, "{ref_letter}:{mut_letter}:{phred}:{ref_position}:{read_position}" +XB1/2,AS1/2,XS1/2 or any sam tag `parse/parse2` `pairtools parse --add-columns XA,XB,NM` format depends on `tag specification `_ +walk_pair_type `parse/parse2` string `pairtools parse2 --add-pair-index` Type of the pair relative to R1 and R2 reads of paired-end sequencing, see `pasring docs `_ +walk_pair_index `parse/parse2` number `pairtools parse2 --add-pair-index` Order of the pair in the complex walk, starting from 5'-end of left read, see `pasring docs `_ +phase `phase` 0, 1 or "." `pairtools phase` Phase of alignment (haplotype 1, 2, on unphased), see `phasing walkthrough `_ +rfrag1, rfrag2 `restrict` number `pairtools restrict` Unique index of the restriction fragment after annotating pairs positions, see `restriction walkthrough `_ +rfrag_start1, rfrag_start2 `restrict` number `pairtools restrict` Coordinate of the start of restriction fragment +rfrag_end1, rfrag_end2 `restrict` number `pairtools restrict` Coordinate of the end of restriction fragment +=================================== =================== ====================== ================================================== ================= + From 1b18a5cf87a7b58fb9103f98a7fea6b2f8083a97 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Sun, 3 Jul 2022 15:40:40 -0400 Subject: [PATCH 09/12] README updates: links updated; Slack link invotation posted; new tools described in the main page --- README.md | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e5d5a276..f9746f9e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Documentation Status](https://readthedocs.org/projects/pairtools/badge/?version=latest)](http://pairtools.readthedocs.org/en/latest/) [![Build Status](https://travis-ci.org/mirnylab/pairtools.svg?branch=master)](https://travis-ci.org/mirnylab/pairtools) -[![Join the chat at https://gitter.im/mirnylab/distiller](https://badges.gitter.im/mirnylab/distiller.svg)](https://gitter.im/mirnylab/distiller?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Join the chat at Open2C Slack](https://join.slack.com/t/open2c/shared_invite/zt-1buzt2uyt-FLGp8~~MvbTCig5sQZTSNQ)](https://join.slack.com/t/open2c/shared_invite/zt-1buzt2uyt-FLGp8~~MvbTCig5sQZTSNQ) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1490831.svg)](https://doi.org/10.5281/zenodo.1490831) ## Process Hi-C pairs with pairtools @@ -19,9 +19,12 @@ operations: - generate extensive statistics of Hi-C datasets - select Hi-C pairs given flexibly defined criteria - restore .sam alignments from Hi-C pairs +- annotate restriction digestion sites +- get the mutated positions in Hi-C pairs To get started: -- Take a look at a [quick example](https://github.com/open2c/pairtools#quick-example) +- Visit [pairtools tutorials](https://pairtools.readthedocs.io/en/latest/examples/pairtools_walkthrough.html), +- Take a look at a [quick example](https://github.com/open2c/pairtools#quick-example), - Check out the detailed [documentation](http://pairtools.readthedocs.io). ## Data formats @@ -32,18 +35,23 @@ format defined by the [4D Nucleome Consortium](https://www.4dnucleome.org/). All pairtools properly manage file headers and keep track of the data processing history. -Additionally, `pairtools` define the .pairsam format, an extension of .pairs that includes the SAM alignments +Additionally, `pairtools` define the [.pairsam format](https://pairtools.readthedocs.io/en/latest/formats.html#pairsam), an extension of .pairs that includes the SAM alignments of a sequenced Hi-C molecule. .pairsam complies with the .pairs format, and can be processed by any tool that operates on .pairs files. +`pairtools` produces a set of additional extra columns, which describe properties of alignments, phase, mutations, restriction and complex walks. +The full list of possible extra columns is provided in the [`pairtools` format specification](https://pairtools.readthedocs.io/en/latest/formats.html#extra-columns). + ## Installation Requirements: - Python 3.x -- Python packages `cython`, `numpy` and `click`. +- Python packages `cython`, `pysam`, `bioframe`, `pyyaml`, `numpy`, `scipy`, `pandas` and `click`. - Command-line utilities `sort` (the Unix version), `bgzip` (shipped with `samtools`) and `samtools`. If available, `pairtools` can compress outputs with `pbgzip` and `lz4`. +For the full list of recommended versions, see [requirements in the the GitHub repo](https://github.com/open2c/pairtools/blob/detect_mutations/requirements.txt). + We highly recommend using the `conda` package manager to install `pairtools` together with all its dependencies. To get it, you can either install the full [Anaconda](https://www.continuum.io/downloads) Python distribution or just the standalone [conda](http://conda.pydata.org/miniconda.html) package manager. With `conda`, you can install `pairtools` and all of its dependencies from the [bioconda](https://bioconda.github.io/index.html) channel. @@ -91,21 +99,27 @@ Hi-C data analysis workflow, based on `pairtools` and [nextflow](https://www.nex ## Tools -- `parse`: read .sam files produced by bwa and form Hi-C pairs +- `parse`: read .sam/.bam files produced by bwa and form Hi-C pairs - form Hi-C pairs by reporting the outer-most mapped positions and the strand on the either side of each molecule; - report unmapped/multimapped (ambiguous alignments)/chimeric alignments as chromosome "!", position 0, strand "-"; - - identify and rescue chrimeric alignments produced by singly-ligated Hi-C - molecules with a sequenced ligation junction on one of the sides; - - annotate chimeric alignments by restriction fragments and report true junctions and hops (One-Read-Based Interactions Annotation, ORBITA); - perform upper-triangular flipping of the sides of Hi-C molecules such that the first side has a lower sorting index than the second side; - form hybrid pairsam output, where each line contains all available data for one Hi-C molecule (outer-most mapped positions on the either side, read ID, pair type, and .sam entries for each alignment); + - report .sam tags or mutations of the alignments; - print the .sam header as #-comment lines at the start of the file. +- `parse2`: read .sam/.bam files with long paired-and or single-end reads and form Hi-C pairs from complex walks + - identify and rescue chrimeric alignments produced by singly-ligated Hi-C + molecules with a sequenced ligation junction on one of the sides; + - annotate chimeric alignments by restriction fragments and report true junctions and hops (One-Read-Based Interactions Annotation, ORBITA); + - perform intra-molecule deduplication of paired-end data when one side reads through the DNA on the other side of the read; + - report index of the pair in the complex walk; + - make combinatorial expansion of pairs produced from the same walk; + - `sort`: sort pairs files (the lexicographic order for chromosomes, the numeric order for the positions, the lexicographic order for pair types). @@ -125,7 +139,10 @@ Hi-C data analysis workflow, based on `pairtools` and [nextflow](https://www.nex - `dedup`: remove PCR duplicates from a sorted triu-flipped .pairs file - remove PCR duplicates by finding pairs of entries with both sides mapped to similar genomic locations (+/- N bp); - - optionally output the PCR duplicate entries into a separate file. + - optionally output the PCR duplicate entries into a separate file; + - detect optical duplicates from the original Illumina read ids; + - apply filtering by various properties of pairs (MAPQ; orientation; distance) together with deduplication; + - output yaml or convenient tsv deduplication stats into text file. - NOTE: in order to remove all PCR duplicates, the input must contain \*all\* mapped read pairs from a single experimental replicate; @@ -136,10 +153,20 @@ Hi-C data analysis workflow, based on `pairtools` and [nextflow](https://www.nex - `split`: split a .pairsam file into .pairs and .sam. +- `flip`: flip pairs to get an upper-triangular matrix + +- `header`: manipulate the .pairs/.pairsam header + - generate new header for headerless .pairs file + - transfer header from one .pairs file to another + - set column names for the .pairs file + - validate that the header corresponds to the information stored in .pairs file + - `stats`: calculate various statistics of .pairs files - `restrict`: identify the span of the restriction fragment forming a Hi-C junction +- `phase`: phase pairs mapped to a diploid genome + ## Contributing [Pull requests](https://akrabat.com/the-beginners-guide-to-contributing-to-a-github-project/) are welcome. From 498ce78a98b05d12a1f5e5b2e52deee6625b1e79 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Mon, 4 Jul 2022 05:47:41 -0400 Subject: [PATCH 10/12] calculate mutations on demand --- pairtools/lib/parse.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/pairtools/lib/parse.py b/pairtools/lib/parse.py index 76503679..55f5acbb 100644 --- a/pairtools/lib/parse.py +++ b/pairtools/lib/parse.py @@ -125,6 +125,7 @@ def streaming_classify( walks_policy=kwargs["walks_policy"], sam_tags=sam_tags, store_seq=store_seq, + report_mismatches=True if 'mismatches' in add_columns else False, ) else: # parse2 parser: pairstream, all_algns1, all_algns2 = parse2_read( @@ -141,6 +142,7 @@ def streaming_classify( store_seq=store_seq, expand=kwargs["expand"], max_expansion_depth=kwargs["max_expansion_depth"], + report_mismatches=True if 'mismatches' in add_columns else False, ) ### Write: @@ -245,14 +247,21 @@ def empty_alignment(): def parse_pysam_entry( - sam, min_mapq, sam_tags=None, store_seq=False, report_3_alignment_end=False + sam, + min_mapq, + sam_tags=None, + store_seq=False, + report_3_alignment_end=False, + report_mismatches=False ): """Parse alignments from pysam AlignedSegment entry :param sam: input pysam AlignedSegment entry :param min_mapq: minimal MAPQ to consider as a proper alignment :param sam_tags: list of sam tags to store :param store_seq: if True, the sequence will be parsed and stored in the output - :param report_3_alignment_end: if True, 3'-end of alignment will be reported as position (will be deprecated) + :param report_3_alignment_end: if True, 3'-end of alignment will be + reported as position (will be deprecated) + :param report_mismatches: if True, mismatches will be parsed from MD field :return: parsed aligned entry (dictionary) """ @@ -284,7 +293,7 @@ def parse_pysam_entry( pos3 = sam.reference_start + 1 # Get number of matches: - if not sam.has_tag("MD"): + if not sam.has_tag("MD") or not report_mismatches: mismatches = "" else: seq = sam.query_sequence.upper() @@ -408,6 +417,7 @@ def parse_read( walks_policy, sam_tags, store_seq, + report_mismatches=False, ): """ Parse sam entries corresponding to a single read (or Hi-C molecule) @@ -441,8 +451,8 @@ def parse_read( return iter([(algns1[0], algns2[0], pair_index)]), algns1, algns2 # Generate a sorted, gap-filled list of all alignments - algns1 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams1] - algns2 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams2] + algns1 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams1] + algns2 = [parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams2] if len(algns1) > 0: algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) @@ -560,6 +570,7 @@ def parse2_read( sam_tags=[], dedup_max_mismatch=3, store_seq=False, + report_mismatches=False, expand=False, max_expansion_depth=None, ): @@ -582,7 +593,7 @@ def parse2_read( if single_end: # Generate a sorted, gap-filled list of all alignments algns1 = [ - parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) + parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams2 # note sams2, that's how these reads are typically parsed ] algns1 = sorted(algns1, key=lambda algn: algn["dist_to_5"]) @@ -632,10 +643,10 @@ def parse2_read( # Generate a sorted, gap-filled list of all alignments algns1 = [ - parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams1 + parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams1 ] algns2 = [ - parse_pysam_entry(sam, min_mapq, sam_tags, store_seq) for sam in sams2 + parse_pysam_entry(sam, min_mapq, sam_tags, store_seq, report_mismatches=report_mismatches) for sam in sams2 ] # Sort alignments by the distance to the 5'-end: From 521425b681c125c0cbcb928f7bbec84360b2c220 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Thu, 7 Jul 2022 14:11:29 -0400 Subject: [PATCH 11/12] flipping is off for parse2 by default. Important notes on flipping recommendations. --- pairtools/cli/parse2.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pairtools/cli/parse2.py b/pairtools/cli/parse2.py index 1743d123..725a5bec 100644 --- a/pairtools/cli/parse2.py +++ b/pairtools/cli/parse2.py @@ -130,9 +130,11 @@ @click.option( "--flip/--no-flip", is_flag=True, - default=True, - help="If specified, do not flip pairs in genomic order and instead preserve " - "the order in which they were sequenced.", + default=False, + help="If specified, flip pairs in genomic order and instead preserve " + "the order in which they were sequenced. Note that no flip is recommended for analysis of walks because it will " + "override the order of alignments in pairs. Flip is required for appropriate deduplication of sorted pairs. " + "Flip is not required for cooler cload, which runs flipping internally. ", ) @click.option( "--drop-readid/--keep-readid", From a1d0738c8dfff0c3562ecc94b13daf3ef5f6f3f2 Mon Sep 17 00:00:00 2001 From: Aleksandra Galitsyna Date: Thu, 7 Jul 2022 15:26:05 -0400 Subject: [PATCH 12/12] test flip added --- tests/test_parse2.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_parse2.py b/tests/test_parse2.py index cef2c912..37075ae1 100644 --- a/tests/test_parse2.py +++ b/tests/test_parse2.py @@ -22,6 +22,7 @@ def test_mock_pysam_parse2_read(): "-c", mock_chroms_path, "--add-pair-index", + "--flip", "--report-position", "junction", "--report-orientation", @@ -81,6 +82,7 @@ def test_mock_pysam_parse2_pair(): "-c", mock_chroms_path, "--add-pair-index", + "--flip", "--report-position", "outer", "--report-orientation",